/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  if (C->clinit_barrier_on_entry()) {
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");

    Label L_skip_barrier;
    Register klass = rscratch1;

    mov_metadata(klass, C->method()->holder()->constant_encoding());
    clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    bind(L_skip_barrier);
  }

  int framesize = C->output()->frame_size_in_bytes();
  int bangsize = C->output()->bang_size_in_bytes();
  bool fp_mode_24b = false;
  int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (C->needs_stack_repair()) {
    // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
    assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
    movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif
}
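
// Frame shape produced by verified_entry() above, as a sketch for the
// stack-banged (LM_LEGACY-style) path; the non-banged path stores rbp at
// the bottom of the allocated frame instead of pushing it:
//
//   [ return address    ]  <- caller's rsp before the call
//   [ saved rbp         ]
//   [ 0xbadb100d cookie ]  <- top word of the frame, only with +VerifyStackAtCalls
//   [ spill/locals ...  ]
//                          <- rsp after the prologue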

void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
  if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
#else
  // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
  bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input:  abort_status
//         rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != nullptr) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != nullptr) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
  }
  bind(L_done);
}
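
// Worked example of the ratio test in rtm_abort_ratio_calculation() above
// (illustrative numbers only): with abort_count = 600, total_count = 10000,
// RTMTotalCountIncrRate = 1 and RTMAbortRatio = 50, we compare
// 600 * 100 = 60000 against 10000 * 1 * 50 = 500000; since 60000 < 500000
// the abort ratio is acceptable and the no_rtm bit is not set.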

// Update counters and perform abort ratio calculation
// input:  abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != nullptr, "should not be null when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != nullptr, "should not be null when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}
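
// For reference, the EAX abort-status bits tested above, as defined by the
// hardware (Intel SDM; mirrored in rtmLocking.hpp):
//   bit 0 - abort caused by an XABORT instruction
//   bit 1 - the transaction may succeed on a retry
//   bit 2 - conflict with another logical processor (memory conflict)
//   bit 3 - internal buffer overflow
//   bit 4 - a debug breakpoint was hit
//   bit 5 - abort occurred during a nested transaction
// Hence the 0x6 mask selects "retryable" (0x2) and "conflict" (0x4).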

// Spin and retry if lock is busy.
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
  andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}
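
// For intuition, the lock elision above corresponds to this user-level RTM
// pattern (an illustrative sketch with the <immintrin.h> intrinsics, not VM
// code; the VM emits xbegin/xend/xabort directly):
//
//   unsigned status = _xbegin();
//   if (status == _XBEGIN_STARTED) {
//     if (!lock_is_free(obj)) _xabort(0);  // lock held -> abort, fall back
//     // ... critical section runs transactionally; the matching unlock
//     // executes _xend() and the whole region commits atomically ...
//   } else {
//     // status carries _XABORT_RETRY (0x2) / _XABORT_CONFLICT (0x4),
//     // the same bits rtm_retry_lock_on_abort() tests.
//   }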

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != nullptr, "should not be null when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//      fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//      fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx1Reg == noreg, "");
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //
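
  // For reference, the low mark-word bits that the tests below decode
  // (legacy/LM_LEGACY encoding; see markWord.hpp):
  //   [ptr            | 01]  unlocked (neutral)
  //   [BasicLock*     | 00]  stack-locked (points into the owner's frame)
  //   [ObjectMonitor* | 10]  inflated (monitor_value)
  //   [               | 11]  marked, used only by the GC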
  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    if (EnableValhalla) {
      // Mask inline_type bit such that we go to the slow path if object is an inline type
      andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
    }
    movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    jcc(Assembler::equal, COUNT); // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
  // If we weren't able to swing _owner from null to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, NO_COUNT);
  // update _owner from BasicLock to thread
  get_thread (scrReg); // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);       // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(thread, rax);                 // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT); // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
  increment(Address(thread, JavaThread::held_monitor_count_offset()));

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX (IllegalMonitorStateException).
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD); // Examine the displaced header
    jcc   (Assembler::zero, COUNT);        // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value); // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.

#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmp(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
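
  // The 1-0 exit race, in outline (an illustrative sketch of the code below):
  //
  //   unlocking thread T1              contending thread T2
  //   -------------------              --------------------
  //   ST  m->_owner = null             CAS m->_owner null -> T2 (acquires)
  //   fence; LD m->_succ               ...parks, sets m->_succ...
  //
  // If T1 observes (cxq|EntryList) != null but _succ == null, the apparent
  // successor vanished in the window and T1 must ensure succession: it tries
  // to CAS itself back in as owner (see the reacquire further down) and, if
  // that CAS succeeds, takes the slow path to wake a successor.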
#ifndef _LP64
  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);
#else // _LP64
  // It's inflated
  Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);   // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);   // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

#endif
  if (LockingMode == LM_LEGACY) {
    bind  (Stacked);
    movptr(tmpReg, Address (boxReg, 0)); // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    // Intentional fall-thru into DONE_LABEL
  }

  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
                                              Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
    testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Lightweight Lock

    Label push;

    const Register top = box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register tagged_monitor = mark;

    // CAS owner (null => current thread).
    xorptr(rax_reg, rax_reg);
    lock(); cmpxchgptr(thread, Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
    jccb(Assembler::equal, locked);

    // Check if recursive.
    cmpptr(thread, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  }

  bind(locked);
  increment(Address(thread, JavaThread::held_monitor_count_offset()));
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  jccb(Assembler::zero, zf_correct);
  stop("Fast Lock ZF != 1");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jccb(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
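
// The lightweight-locking fast path above, in outline (an illustrative
// sketch, not VM code):
//
//   lock(obj):
//     if (mark.is_monitor())                     goto inflated;
//     if (lock_stack.is_full())                  goto slow_path;
//     if (lock_stack.top() == obj)               { push(obj); done; }  // recursive
//     if (!CAS(obj->mark, mark|0b01, mark&~0b01)) goto slow_path;
//     push(obj); done;
//
// The mark-word transition is 0b01 (unlocked) -> 0b00 (locked), and the
// per-thread lock-stack replaces the on-stack BasicLock used by LM_LEGACY.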

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked;

  // Assume success.
  decrement(Address(thread, JavaThread::held_monitor_count_offset()));

  const Register mark = t;
  const Register top = reg_rax;

  Label dummy;
  C2FastUnlockLightweightStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
  Label& check_successor = stub == nullptr ? dummy : stub->check_successor();

  { // Lightweight Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Prefetch mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    // mark contains the tagged ObjectMonitor*.
    const Register monitor = mark;

#ifndef _LP64
    // Check if recursive.
    xorptr(reg_rax, reg_rax);
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
    jcc(Assembler::notZero, check_successor);

    // Check if the entry lists are empty.
    movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
    jcc(Assembler::notZero, check_successor);

    // Release lock.
    movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
#else // _LP64
    Label recursive;

    // Check if recursive.
    cmpptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
    jccb(Assembler::notEqual, recursive);

    // Check if the entry lists are empty.
    movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
    jcc(Assembler::notZero, check_successor);

    // Release lock.
    movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
    jmpb(unlocked);

    // Recursive unlock.
    bind(recursive);
    decrement(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
    xorl(t, t);
#endif
  }

  bind(unlocked);
  if (stub != nullptr) {
    bind(stub->unlocked_continuation());
  }

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  jccb(Assembler::zero, zf_correct);
  stop("Fast Unlock ZF != 1");
#endif

  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jccb(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}
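
// The mask constants used above work as follows (single precision shown;
// double precision is analogous with a 63-bit pattern):
//   abs: x & 0x7FFFFFFF clears the sign bit, e.g. -1.5f (0xBFC00000)
//        becomes 1.5f (0x3FC00000);
//   neg: x ^ 0x80000000 flips the sign bit, e.g. 1.5f -> -1.5f.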

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src); // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src); // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *     (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp, btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */
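
  /* Worked example for maxF with a = -0.0f, b = +0.0f:
   *   b is not < +0.0, so btmp = b = +0.0 and atmp = a = -0.0;
   *   vmaxps on equal inputs returns its second operand, +0.0 -- the Java answer.
   * And with a = NaN, b = 1.0f:
   *   btmp = b, atmp = a = NaN; atmp is unordered with itself under UNORD_Q,
   *   so the final blend selects atmp and NaN is propagated, as Java requires.
   */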

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
vminpd(dst, atmp, btmp, vlen_enc); 1457 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1458 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1459 } else { 1460 assert(is_double_word && !is_min, "sanity"); 1461 evpmovq2m(ktmp, b, vlen_enc); 1462 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1463 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1464 vmaxpd(dst, atmp, btmp, vlen_enc); 1465 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1466 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1467 } 1468 } 1469 1470 // Float/Double signum 1471 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1472 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1473 1474 Label DONE_LABEL; 1475 1476 if (opcode == Op_SignumF) { 1477 assert(UseSSE > 0, "required"); 1478 ucomiss(dst, zero); 1479 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1480 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1481 movflt(dst, one); 1482 jcc(Assembler::above, DONE_LABEL); 1483 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1484 } else if (opcode == Op_SignumD) { 1485 assert(UseSSE > 1, "required"); 1486 ucomisd(dst, zero); 1487 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1488 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1489 movdbl(dst, one); 1490 jcc(Assembler::above, DONE_LABEL); 1491 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1492 } 1493 1494 bind(DONE_LABEL); 1495 } 1496 1497 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1498 if (sign) { 1499 pmovsxbw(dst, src); 1500 } else { 1501 pmovzxbw(dst, src); 1502 } 1503 } 1504 1505 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1506 if (sign) { 1507 vpmovsxbw(dst, src, vector_len); 1508 } else { 1509 vpmovzxbw(dst, src, vector_len); 1510 } 1511 } 1512 1513 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1514 if (sign) { 1515 vpmovsxbd(dst, src, vector_len); 1516 } else { 1517 vpmovzxbd(dst, src, vector_len); 1518 } 1519 } 1520 1521 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1522 if (sign) { 1523 vpmovsxwd(dst, src, vector_len); 1524 } else { 1525 vpmovzxwd(dst, src, vector_len); 1526 } 1527 } 1528 1529 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1530 int shift, int vector_len) { 1531 if (opcode == Op_RotateLeftV) { 1532 if (etype == T_INT) { 1533 evprold(dst, src, shift, vector_len); 1534 } else { 1535 assert(etype == T_LONG, "expected type T_LONG"); 1536 evprolq(dst, src, shift, vector_len); 1537 } 1538 } else { 1539 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1540 if (etype == T_INT) { 1541 evprord(dst, src, shift, vector_len); 1542 } else { 1543 assert(etype == T_LONG, "expected type T_LONG"); 1544 evprorq(dst, src, shift, vector_len); 1545 } 1546 } 1547 } 1548 1549 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1550 XMMRegister shift, int vector_len) { 1551 if (opcode == Op_RotateLeftV) { 1552 if (etype == T_INT) { 1553 evprolvd(dst, src, shift, vector_len); 1554 } else { 1555 assert(etype == 
T_LONG, "expected type T_LONG"); 1556 evprolvq(dst, src, shift, vector_len); 1557 } 1558 } else { 1559 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1560 if (etype == T_INT) { 1561 evprorvd(dst, src, shift, vector_len); 1562 } else { 1563 assert(etype == T_LONG, "expected type T_LONG"); 1564 evprorvq(dst, src, shift, vector_len); 1565 } 1566 } 1567 } 1568 1569 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1570 if (opcode == Op_RShiftVI) { 1571 psrad(dst, shift); 1572 } else if (opcode == Op_LShiftVI) { 1573 pslld(dst, shift); 1574 } else { 1575 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1576 psrld(dst, shift); 1577 } 1578 } 1579 1580 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1581 switch (opcode) { 1582 case Op_RShiftVI: psrad(dst, shift); break; 1583 case Op_LShiftVI: pslld(dst, shift); break; 1584 case Op_URShiftVI: psrld(dst, shift); break; 1585 1586 default: assert(false, "%s", NodeClassNames[opcode]); 1587 } 1588 } 1589 1590 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1591 if (opcode == Op_RShiftVI) { 1592 vpsrad(dst, nds, shift, vector_len); 1593 } else if (opcode == Op_LShiftVI) { 1594 vpslld(dst, nds, shift, vector_len); 1595 } else { 1596 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1597 vpsrld(dst, nds, shift, vector_len); 1598 } 1599 } 1600 1601 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1602 switch (opcode) { 1603 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1604 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1605 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1606 1607 default: assert(false, "%s", NodeClassNames[opcode]); 1608 } 1609 } 1610 1611 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1612 switch (opcode) { 1613 case Op_RShiftVB: // fall-through 1614 case Op_RShiftVS: psraw(dst, shift); break; 1615 1616 case Op_LShiftVB: // fall-through 1617 case Op_LShiftVS: psllw(dst, shift); break; 1618 1619 case Op_URShiftVS: // fall-through 1620 case Op_URShiftVB: psrlw(dst, shift); break; 1621 1622 default: assert(false, "%s", NodeClassNames[opcode]); 1623 } 1624 } 1625 1626 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1627 switch (opcode) { 1628 case Op_RShiftVB: // fall-through 1629 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1630 1631 case Op_LShiftVB: // fall-through 1632 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1633 1634 case Op_URShiftVS: // fall-through 1635 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1636 1637 default: assert(false, "%s", NodeClassNames[opcode]); 1638 } 1639 } 1640 1641 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1642 switch (opcode) { 1643 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1644 case Op_LShiftVL: psllq(dst, shift); break; 1645 case Op_URShiftVL: psrlq(dst, shift); break; 1646 1647 default: assert(false, "%s", NodeClassNames[opcode]); 1648 } 1649 } 1650 1651 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1652 if (opcode == Op_RShiftVL) { 1653 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1654 } else if (opcode == Op_LShiftVL) { 1655 
psllq(dst, shift); 1656 } else { 1657 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1658 psrlq(dst, shift); 1659 } 1660 } 1661 1662 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1663 switch (opcode) { 1664 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1665 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1666 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1667 1668 default: assert(false, "%s", NodeClassNames[opcode]); 1669 } 1670 } 1671 1672 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1673 if (opcode == Op_RShiftVL) { 1674 evpsraq(dst, nds, shift, vector_len); 1675 } else if (opcode == Op_LShiftVL) { 1676 vpsllq(dst, nds, shift, vector_len); 1677 } else { 1678 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1679 vpsrlq(dst, nds, shift, vector_len); 1680 } 1681 } 1682 1683 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1684 switch (opcode) { 1685 case Op_RShiftVB: // fall-through 1686 case Op_RShiftVS: // fall-through 1687 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1688 1689 case Op_LShiftVB: // fall-through 1690 case Op_LShiftVS: // fall-through 1691 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1692 1693 case Op_URShiftVB: // fall-through 1694 case Op_URShiftVS: // fall-through 1695 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1696 1697 default: assert(false, "%s", NodeClassNames[opcode]); 1698 } 1699 } 1700 1701 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1702 switch (opcode) { 1703 case Op_RShiftVB: // fall-through 1704 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1705 1706 case Op_LShiftVB: // fall-through 1707 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1708 1709 case Op_URShiftVB: // fall-through 1710 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1711 1712 default: assert(false, "%s", NodeClassNames[opcode]); 1713 } 1714 } 1715 1716 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1717 assert(UseAVX >= 2, "required"); 1718 switch (opcode) { 1719 case Op_RShiftVL: { 1720 if (UseAVX > 2) { 1721 assert(tmp == xnoreg, "not used"); 1722 if (!VM_Version::supports_avx512vl()) { 1723 vlen_enc = Assembler::AVX_512bit; 1724 } 1725 evpsravq(dst, src, shift, vlen_enc); 1726 } else { 1727 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1728 vpsrlvq(dst, src, shift, vlen_enc); 1729 vpsrlvq(tmp, tmp, shift, vlen_enc); 1730 vpxor(dst, dst, tmp, vlen_enc); 1731 vpsubq(dst, dst, tmp, vlen_enc); 1732 } 1733 break; 1734 } 1735 case Op_LShiftVL: { 1736 assert(tmp == xnoreg, "not used"); 1737 vpsllvq(dst, src, shift, vlen_enc); 1738 break; 1739 } 1740 case Op_URShiftVL: { 1741 assert(tmp == xnoreg, "not used"); 1742 vpsrlvq(dst, src, shift, vlen_enc); 1743 break; 1744 } 1745 default: assert(false, "%s", NodeClassNames[opcode]); 1746 } 1747 } 1748 1749 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1750 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1751 assert(opcode == Op_LShiftVB || 1752 opcode == Op_RShiftVB || 1753 
opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1754 bool sign = (opcode != Op_URShiftVB); 1755 assert(vector_len == 0, "required"); 1756 vextendbd(sign, dst, src, 1); 1757 vpmovzxbd(vtmp, shift, 1); 1758 varshiftd(opcode, dst, dst, vtmp, 1); 1759 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1760 vextracti128_high(vtmp, dst); 1761 vpackusdw(dst, dst, vtmp, 0); 1762 } 1763 1764 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1765 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1766 assert(opcode == Op_LShiftVB || 1767 opcode == Op_RShiftVB || 1768 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1769 bool sign = (opcode != Op_URShiftVB); 1770 int ext_vector_len = vector_len + 1; 1771 vextendbw(sign, dst, src, ext_vector_len); 1772 vpmovzxbw(vtmp, shift, ext_vector_len); 1773 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1774 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1775 if (vector_len == 0) { 1776 vextracti128_high(vtmp, dst); 1777 vpackuswb(dst, dst, vtmp, vector_len); 1778 } else { 1779 vextracti64x4_high(vtmp, dst); 1780 vpackuswb(dst, dst, vtmp, vector_len); 1781 vpermq(dst, dst, 0xD8, vector_len); 1782 } 1783 } 1784 1785 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1786 switch(typ) { 1787 case T_BYTE: 1788 pinsrb(dst, val, idx); 1789 break; 1790 case T_SHORT: 1791 pinsrw(dst, val, idx); 1792 break; 1793 case T_INT: 1794 pinsrd(dst, val, idx); 1795 break; 1796 case T_LONG: 1797 pinsrq(dst, val, idx); 1798 break; 1799 default: 1800 assert(false,"Should not reach here."); 1801 break; 1802 } 1803 } 1804 1805 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1806 switch(typ) { 1807 case T_BYTE: 1808 vpinsrb(dst, src, val, idx); 1809 break; 1810 case T_SHORT: 1811 vpinsrw(dst, src, val, idx); 1812 break; 1813 case T_INT: 1814 vpinsrd(dst, src, val, idx); 1815 break; 1816 case T_LONG: 1817 vpinsrq(dst, src, val, idx); 1818 break; 1819 default: 1820 assert(false,"Should not reach here."); 1821 break; 1822 } 1823 } 1824 1825 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1826 switch(typ) { 1827 case T_INT: 1828 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1829 break; 1830 case T_FLOAT: 1831 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1832 break; 1833 case T_LONG: 1834 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1835 break; 1836 case T_DOUBLE: 1837 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1838 break; 1839 default: 1840 assert(false,"Should not reach here."); 1841 break; 1842 } 1843 } 1844 1845 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1846 switch(typ) { 1847 case T_INT: 1848 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1849 break; 1850 case T_FLOAT: 1851 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1852 break; 1853 case T_LONG: 1854 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1855 break; 1856 case T_DOUBLE: 1857 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), 
vector_len); 1858 break; 1859 default: 1860 assert(false,"Should not reach here."); 1861 break; 1862 } 1863 } 1864 1865 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1866 switch(typ) { 1867 case T_INT: 1868 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1869 break; 1870 case T_FLOAT: 1871 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1872 break; 1873 case T_LONG: 1874 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1875 break; 1876 case T_DOUBLE: 1877 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1878 break; 1879 default: 1880 assert(false,"Should not reach here."); 1881 break; 1882 } 1883 } 1884 1885 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1886 if (vlen_in_bytes <= 16) { 1887 pxor (dst, dst); 1888 psubb(dst, src); 1889 switch (elem_bt) { 1890 case T_BYTE: /* nothing to do */ break; 1891 case T_SHORT: pmovsxbw(dst, dst); break; 1892 case T_INT: pmovsxbd(dst, dst); break; 1893 case T_FLOAT: pmovsxbd(dst, dst); break; 1894 case T_LONG: pmovsxbq(dst, dst); break; 1895 case T_DOUBLE: pmovsxbq(dst, dst); break; 1896 1897 default: assert(false, "%s", type2name(elem_bt)); 1898 } 1899 } else { 1900 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1901 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1902 1903 vpxor (dst, dst, dst, vlen_enc); 1904 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc); 1905 1906 switch (elem_bt) { 1907 case T_BYTE: /* nothing to do */ break; 1908 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1909 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1910 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1911 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1912 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1913 1914 default: assert(false, "%s", type2name(elem_bt)); 1915 } 1916 } 1917 } 1918 1919 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1920 if (novlbwdq) { 1921 vpmovsxbd(xtmp, src, vlen_enc); 1922 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1923 Assembler::eq, true, vlen_enc, noreg); 1924 } else { 1925 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1926 vpsubb(xtmp, xtmp, src, vlen_enc); 1927 evpmovb2m(dst, xtmp, vlen_enc); 1928 } 1929 } 1930 1931 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1932 switch (vlen_in_bytes) { 1933 case 4: movdl(dst, src); break; 1934 case 8: movq(dst, src); break; 1935 case 16: movdqu(dst, src); break; 1936 case 32: vmovdqu(dst, src); break; 1937 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1938 default: ShouldNotReachHere(); 1939 } 1940 } 1941 1942 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1943 assert(rscratch != noreg || always_reachable(src), "missing"); 1944 1945 if (reachable(src)) { 1946 load_vector(dst, as_Address(src), vlen_in_bytes); 1947 } else { 1948 lea(rscratch, src); 1949 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1950 } 1951 } 1952 1953 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1954 int vlen_enc = vector_length_encoding(vlen); 1955 if (VM_Version::supports_avx()) { 1956 if (bt == 
T_LONG) { 1957 if (VM_Version::supports_avx2()) { 1958 vpbroadcastq(dst, src, vlen_enc); 1959 } else { 1960 vmovddup(dst, src, vlen_enc); 1961 } 1962 } else if (bt == T_DOUBLE) { 1963 if (vlen_enc != Assembler::AVX_128bit) { 1964 vbroadcastsd(dst, src, vlen_enc, noreg); 1965 } else { 1966 vmovddup(dst, src, vlen_enc); 1967 } 1968 } else { 1969 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1970 vpbroadcastd(dst, src, vlen_enc); 1971 } else { 1972 vbroadcastss(dst, src, vlen_enc); 1973 } 1974 } 1975 } else if (VM_Version::supports_sse3()) { 1976 movddup(dst, src); 1977 } else { 1978 movq(dst, src); 1979 if (vlen == 16) { 1980 punpcklqdq(dst, dst); 1981 } 1982 } 1983 } 1984 1985 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1986 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1987 int offset = exact_log2(type2aelembytes(bt)) << 6; 1988 if (is_floating_point_type(bt)) { 1989 offset += 128; 1990 } 1991 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1992 load_vector(dst, addr, vlen_in_bytes); 1993 } 1994 1995 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 1996 1997 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1998 int vector_len = Assembler::AVX_128bit; 1999 2000 switch (opcode) { 2001 case Op_AndReductionV: pand(dst, src); break; 2002 case Op_OrReductionV: por (dst, src); break; 2003 case Op_XorReductionV: pxor(dst, src); break; 2004 case Op_MinReductionV: 2005 switch (typ) { 2006 case T_BYTE: pminsb(dst, src); break; 2007 case T_SHORT: pminsw(dst, src); break; 2008 case T_INT: pminsd(dst, src); break; 2009 case T_LONG: assert(UseAVX > 2, "required"); 2010 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 2011 default: assert(false, "wrong type"); 2012 } 2013 break; 2014 case Op_MaxReductionV: 2015 switch (typ) { 2016 case T_BYTE: pmaxsb(dst, src); break; 2017 case T_SHORT: pmaxsw(dst, src); break; 2018 case T_INT: pmaxsd(dst, src); break; 2019 case T_LONG: assert(UseAVX > 2, "required"); 2020 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 2021 default: assert(false, "wrong type"); 2022 } 2023 break; 2024 case Op_AddReductionVF: addss(dst, src); break; 2025 case Op_AddReductionVD: addsd(dst, src); break; 2026 case Op_AddReductionVI: 2027 switch (typ) { 2028 case T_BYTE: paddb(dst, src); break; 2029 case T_SHORT: paddw(dst, src); break; 2030 case T_INT: paddd(dst, src); break; 2031 default: assert(false, "wrong type"); 2032 } 2033 break; 2034 case Op_AddReductionVL: paddq(dst, src); break; 2035 case Op_MulReductionVF: mulss(dst, src); break; 2036 case Op_MulReductionVD: mulsd(dst, src); break; 2037 case Op_MulReductionVI: 2038 switch (typ) { 2039 case T_SHORT: pmullw(dst, src); break; 2040 case T_INT: pmulld(dst, src); break; 2041 default: assert(false, "wrong type"); 2042 } 2043 break; 2044 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 2045 evpmullq(dst, dst, src, vector_len); break; 2046 default: assert(false, "wrong opcode"); 2047 } 2048 } 2049 2050 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 2051 int vector_len = Assembler::AVX_256bit; 2052 2053 switch (opcode) { 2054 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 2055 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 2056 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); 
break; 2057 case Op_MinReductionV: 2058 switch (typ) { 2059 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 2060 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 2061 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 2062 case T_LONG: assert(UseAVX > 2, "required"); 2063 vpminsq(dst, src1, src2, vector_len); break; 2064 default: assert(false, "wrong type"); 2065 } 2066 break; 2067 case Op_MaxReductionV: 2068 switch (typ) { 2069 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 2070 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 2071 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 2072 case T_LONG: assert(UseAVX > 2, "required"); 2073 vpmaxsq(dst, src1, src2, vector_len); break; 2074 default: assert(false, "wrong type"); 2075 } 2076 break; 2077 case Op_AddReductionVI: 2078 switch (typ) { 2079 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 2080 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 2081 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 2082 default: assert(false, "wrong type"); 2083 } 2084 break; 2085 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 2086 case Op_MulReductionVI: 2087 switch (typ) { 2088 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 2089 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 2090 default: assert(false, "wrong type"); 2091 } 2092 break; 2093 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 2094 default: assert(false, "wrong opcode"); 2095 } 2096 } 2097 2098 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 2099 XMMRegister dst, XMMRegister src, 2100 XMMRegister vtmp1, XMMRegister vtmp2) { 2101 switch (opcode) { 2102 case Op_AddReductionVF: 2103 case Op_MulReductionVF: 2104 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 2105 break; 2106 2107 case Op_AddReductionVD: 2108 case Op_MulReductionVD: 2109 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 2110 break; 2111 2112 default: assert(false, "wrong opcode"); 2113 } 2114 } 2115 2116 void C2_MacroAssembler::reduceB(int opcode, int vlen, 2117 Register dst, Register src1, XMMRegister src2, 2118 XMMRegister vtmp1, XMMRegister vtmp2) { 2119 switch (vlen) { 2120 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2121 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2122 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2123 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2124 2125 default: assert(false, "wrong vector length"); 2126 } 2127 } 2128 2129 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 2130 Register dst, Register src1, XMMRegister src2, 2131 XMMRegister vtmp1, XMMRegister vtmp2) { 2132 switch (vlen) { 2133 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2134 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2135 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2136 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2137 2138 default: assert(false, "wrong vector length"); 2139 } 2140 } 2141 2142 void C2_MacroAssembler::reduceS(int opcode, int vlen, 2143 Register dst, Register src1, XMMRegister src2, 2144 XMMRegister vtmp1, XMMRegister vtmp2) { 2145 switch (vlen) { 2146 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2147 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2148 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2149 case 32: reduce32S(opcode, dst, src1, 
src2, vtmp1, vtmp2); break; 2150 2151 default: assert(false, "wrong vector length"); 2152 } 2153 } 2154 2155 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2156 Register dst, Register src1, XMMRegister src2, 2157 XMMRegister vtmp1, XMMRegister vtmp2) { 2158 switch (vlen) { 2159 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2160 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2161 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2162 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2163 2164 default: assert(false, "wrong vector length"); 2165 } 2166 } 2167 2168 #ifdef _LP64 2169 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2170 Register dst, Register src1, XMMRegister src2, 2171 XMMRegister vtmp1, XMMRegister vtmp2) { 2172 switch (vlen) { 2173 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2174 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2175 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2176 2177 default: assert(false, "wrong vector length"); 2178 } 2179 } 2180 #endif // _LP64 2181 2182 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2183 switch (vlen) { 2184 case 2: 2185 assert(vtmp2 == xnoreg, ""); 2186 reduce2F(opcode, dst, src, vtmp1); 2187 break; 2188 case 4: 2189 assert(vtmp2 == xnoreg, ""); 2190 reduce4F(opcode, dst, src, vtmp1); 2191 break; 2192 case 8: 2193 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2194 break; 2195 case 16: 2196 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2197 break; 2198 default: assert(false, "wrong vector length"); 2199 } 2200 } 2201 2202 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2203 switch (vlen) { 2204 case 2: 2205 assert(vtmp2 == xnoreg, ""); 2206 reduce2D(opcode, dst, src, vtmp1); 2207 break; 2208 case 4: 2209 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2210 break; 2211 case 8: 2212 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2213 break; 2214 default: assert(false, "wrong vector length"); 2215 } 2216 } 2217 2218 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2219 if (opcode == Op_AddReductionVI) { 2220 if (vtmp1 != src2) { 2221 movdqu(vtmp1, src2); 2222 } 2223 phaddd(vtmp1, vtmp1); 2224 } else { 2225 pshufd(vtmp1, src2, 0x1); 2226 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2227 } 2228 movdl(vtmp2, src1); 2229 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2230 movdl(dst, vtmp1); 2231 } 2232 2233 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2234 if (opcode == Op_AddReductionVI) { 2235 if (vtmp1 != src2) { 2236 movdqu(vtmp1, src2); 2237 } 2238 phaddd(vtmp1, src2); 2239 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2240 } else { 2241 pshufd(vtmp2, src2, 0xE); 2242 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2243 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2244 } 2245 } 2246 2247 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2248 if (opcode == Op_AddReductionVI) { 2249 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2250 vextracti128_high(vtmp2, vtmp1); 2251 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2252 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2253 } else { 2254 
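// Non-add reductions have no 256-bit horizontal instruction: fold the upper
// 128-bit lane into the lower one, then finish with the 128-bit reducer.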
vextracti128_high(vtmp1, src2); 2255 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2256 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2257 } 2258 } 2259 2260 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2261 vextracti64x4_high(vtmp2, src2); 2262 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2263 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2264 } 2265 2266 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2267 pshufd(vtmp2, src2, 0x1); 2268 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2269 movdqu(vtmp1, vtmp2); 2270 psrldq(vtmp1, 2); 2271 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2272 movdqu(vtmp2, vtmp1); 2273 psrldq(vtmp2, 1); 2274 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2275 movdl(vtmp2, src1); 2276 pmovsxbd(vtmp1, vtmp1); 2277 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2278 pextrb(dst, vtmp1, 0x0); 2279 movsbl(dst, dst); 2280 } 2281 2282 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2283 pshufd(vtmp1, src2, 0xE); 2284 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2285 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2286 } 2287 2288 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2289 vextracti128_high(vtmp2, src2); 2290 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2291 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2292 } 2293 2294 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2295 vextracti64x4_high(vtmp1, src2); 2296 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2297 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2298 } 2299 2300 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2301 pmovsxbw(vtmp2, src2); 2302 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2303 } 2304 2305 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2306 if (UseAVX > 1) { 2307 int vector_len = Assembler::AVX_256bit; 2308 vpmovsxbw(vtmp1, src2, vector_len); 2309 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2310 } else { 2311 pmovsxbw(vtmp2, src2); 2312 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2313 pshufd(vtmp2, src2, 0xE); // bring the high 8 bytes into the low half (as in reduce16B) 2314 pmovsxbw(vtmp2, vtmp2); 2315 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2316 } 2317 } 2318 2319 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2320 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2321 int vector_len = Assembler::AVX_512bit; 2322 vpmovsxbw(vtmp1, src2, vector_len); 2323 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2324 } else { 2325 assert(UseAVX >= 2,"Should not reach here."); 2326 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2327 vextracti128_high(vtmp2, src2); 2328 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2329 } 2330 } 2331 2332 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2333 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2334 vextracti64x4_high(vtmp2,
src2); 2335 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2336 } 2337 2338 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2339 if (opcode == Op_AddReductionVI) { 2340 if (vtmp1 != src2) { 2341 movdqu(vtmp1, src2); 2342 } 2343 phaddw(vtmp1, vtmp1); 2344 phaddw(vtmp1, vtmp1); 2345 } else { 2346 pshufd(vtmp2, src2, 0x1); 2347 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2348 movdqu(vtmp1, vtmp2); 2349 psrldq(vtmp1, 2); 2350 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2351 } 2352 movdl(vtmp2, src1); 2353 pmovsxwd(vtmp1, vtmp1); 2354 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2355 pextrw(dst, vtmp1, 0x0); 2356 movswl(dst, dst); 2357 } 2358 2359 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2360 if (opcode == Op_AddReductionVI) { 2361 if (vtmp1 != src2) { 2362 movdqu(vtmp1, src2); 2363 } 2364 phaddw(vtmp1, src2); 2365 } else { 2366 pshufd(vtmp1, src2, 0xE); 2367 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2368 } 2369 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2370 } 2371 2372 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2373 if (opcode == Op_AddReductionVI) { 2374 int vector_len = Assembler::AVX_256bit; 2375 vphaddw(vtmp2, src2, src2, vector_len); 2376 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2377 } else { 2378 vextracti128_high(vtmp2, src2); 2379 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2380 } 2381 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2382 } 2383 2384 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2385 int vector_len = Assembler::AVX_256bit; 2386 vextracti64x4_high(vtmp1, src2); 2387 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2388 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2389 } 2390 2391 #ifdef _LP64 2392 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2393 pshufd(vtmp2, src2, 0xE); 2394 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2395 movdq(vtmp1, src1); 2396 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2397 movdq(dst, vtmp1); 2398 } 2399 2400 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2401 vextracti128_high(vtmp1, src2); 2402 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2403 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2404 } 2405 2406 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2407 vextracti64x4_high(vtmp2, src2); 2408 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2409 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2410 } 2411 2412 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2413 mov64(temp, -1L); 2414 bzhiq(temp, temp, len); 2415 kmovql(dst, temp); 2416 } 2417 #endif // _LP64 2418 2419 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2420 reduce_operation_128(T_FLOAT, opcode, dst, src); 2421 pshufd(vtmp, src, 0x1); 2422 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2423 } 2424 2425 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) 
{ 2426 reduce2F(opcode, dst, src, vtmp); 2427 pshufd(vtmp, src, 0x2); 2428 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2429 pshufd(vtmp, src, 0x3); 2430 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2431 } 2432 2433 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2434 reduce4F(opcode, dst, src, vtmp2); 2435 vextractf128_high(vtmp2, src); 2436 reduce4F(opcode, dst, vtmp2, vtmp1); 2437 } 2438 2439 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2440 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2441 vextracti64x4_high(vtmp1, src); 2442 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2443 } 2444 2445 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2446 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2447 pshufd(vtmp, src, 0xE); 2448 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2449 } 2450 2451 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2452 reduce2D(opcode, dst, src, vtmp2); 2453 vextractf128_high(vtmp2, src); 2454 reduce2D(opcode, dst, vtmp2, vtmp1); 2455 } 2456 2457 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2458 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2459 vextracti64x4_high(vtmp1, src); 2460 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2461 } 2462 2463 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2464 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2465 } 2466 2467 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2468 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2469 } 2470 2471 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2472 int vec_enc) { 2473 switch(elem_bt) { 2474 case T_INT: 2475 case T_FLOAT: 2476 vmaskmovps(dst, src, mask, vec_enc); 2477 break; 2478 case T_LONG: 2479 case T_DOUBLE: 2480 vmaskmovpd(dst, src, mask, vec_enc); 2481 break; 2482 default: 2483 fatal("Unsupported type %s", type2name(elem_bt)); 2484 break; 2485 } 2486 } 2487 2488 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2489 int vec_enc) { 2490 switch(elem_bt) { 2491 case T_INT: 2492 case T_FLOAT: 2493 vmaskmovps(dst, src, mask, vec_enc); 2494 break; 2495 case T_LONG: 2496 case T_DOUBLE: 2497 vmaskmovpd(dst, src, mask, vec_enc); 2498 break; 2499 default: 2500 fatal("Unsupported type %s", type2name(elem_bt)); 2501 break; 2502 } 2503 } 2504 2505 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2506 XMMRegister dst, XMMRegister src, 2507 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2508 XMMRegister xmm_0, XMMRegister xmm_1) { 2509 const int permconst[] = {1, 14}; 2510 XMMRegister wsrc = src; 2511 XMMRegister wdst = xmm_0; 2512 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2513 2514 int vlen_enc = Assembler::AVX_128bit; 2515 if (vlen == 16) { 2516 vlen_enc = Assembler::AVX_256bit; 2517 } 2518 2519 for (int i = log2(vlen) - 1; i >=0; i--) { 2520 if (i == 0 && !is_dst_valid) { 2521 wdst = dst; 2522 } 2523 if (i == 3) { 2524 vextracti64x4_high(wtmp, wsrc); 2525 } else if (i == 2) { 2526 vextracti128_high(wtmp, wsrc); 2527 } else { // i = [0,1] 2528 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2529 } 2530 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2531 wsrc = wdst; 2532 vlen_enc = Assembler::AVX_128bit; 2533 } 2534 if (is_dst_valid) { 2535 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2536 } 2537 } 2538 2539 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2540 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2541 XMMRegister xmm_0, XMMRegister xmm_1) { 2542 XMMRegister wsrc = src; 2543 XMMRegister wdst = xmm_0; 2544 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2545 int vlen_enc = Assembler::AVX_128bit; 2546 if (vlen == 8) { 2547 vlen_enc = Assembler::AVX_256bit; 2548 } 2549 for (int i = log2(vlen) - 1; i >=0; i--) { 2550 if (i == 0 && !is_dst_valid) { 2551 wdst = dst; 2552 } 2553 if (i == 1) { 2554 vextracti128_high(wtmp, wsrc); 2555 } else if (i == 2) { 2556 vextracti64x4_high(wtmp, wsrc); 2557 } else { 2558 assert(i == 0, "%d", i); 2559 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2560 } 2561 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2562 wsrc = wdst; 2563 vlen_enc = Assembler::AVX_128bit; 2564 } 2565 if (is_dst_valid) { 2566 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2567 } 2568 } 2569 2570 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2571 switch (bt) { 2572 case T_BYTE: pextrb(dst, src, idx); break; 2573 case T_SHORT: pextrw(dst, src, idx); break; 2574 case T_INT: pextrd(dst, src, idx); break; 2575 case T_LONG: pextrq(dst, src, idx); break; 2576 2577 default: 2578 assert(false,"Should not reach here."); 2579 break; 2580 } 2581 } 2582 2583 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2584 int esize = type2aelembytes(typ); 2585 int elem_per_lane = 16/esize; 2586 int lane = elemindex / elem_per_lane; 2587 int eindex = elemindex % elem_per_lane; 2588 2589 if (lane >= 2) { 2590 assert(UseAVX > 2, "required"); 2591 vextractf32x4(dst, src, lane & 3); 2592 return dst; 2593 } else if (lane > 0) { 2594 assert(UseAVX > 0, "required"); 2595 vextractf128(dst, src, lane); 2596 return dst; 2597 } else { 2598 return src; 2599 } 2600 } 2601 2602 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2603 if (typ == T_BYTE) { 2604 movsbl(dst, dst); 2605 } else if (typ == T_SHORT) { 2606 movswl(dst, dst); 2607 } 2608 } 2609 2610 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2611 int esize = type2aelembytes(typ); 2612 int elem_per_lane = 16/esize; 2613 int eindex = elemindex % elem_per_lane; 2614 assert(is_integral_type(typ),"required"); 2615 2616 if (eindex == 0) { 2617 if (typ == T_LONG) { 2618 movq(dst, src); 2619 } else { 2620 movdl(dst, src); 2621 movsxl(typ, dst); 2622 } 2623 } else { 2624 extract(typ, dst, src, eindex); 2625 movsxl(typ, dst); 2626 } 2627 } 2628 2629 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 
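// src is assumed to already hold the 128-bit lane containing the element
// (typically selected by get_lane above), so only the index within the lane
// matters here; e.g. for T_FLOAT (esize 4) there are 4 elements per lane and
// elemindex 6 maps to eindex 2.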
2630 int esize = type2aelembytes(typ); 2631 int elem_per_lane = 16/esize; 2632 int eindex = elemindex % elem_per_lane; 2633 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2634 2635 if (eindex == 0) { 2636 movq(dst, src); 2637 } else { 2638 if (typ == T_FLOAT) { 2639 if (UseAVX == 0) { 2640 movdqu(dst, src); 2641 shufps(dst, dst, eindex); 2642 } else { 2643 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2644 } 2645 } else { 2646 if (UseAVX == 0) { 2647 movdqu(dst, src); 2648 psrldq(dst, eindex*esize); 2649 } else { 2650 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2651 } 2652 movq(dst, dst); 2653 } 2654 } 2655 // Zero upper bits 2656 if (typ == T_FLOAT) { 2657 if (UseAVX == 0) { 2658 assert(vtmp != xnoreg, "required."); 2659 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2660 pand(dst, vtmp); 2661 } else { 2662 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2663 } 2664 } 2665 } 2666 2667 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2668 switch(typ) { 2669 case T_BYTE: 2670 case T_BOOLEAN: 2671 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2672 break; 2673 case T_SHORT: 2674 case T_CHAR: 2675 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2676 break; 2677 case T_INT: 2678 case T_FLOAT: 2679 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2680 break; 2681 case T_LONG: 2682 case T_DOUBLE: 2683 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2684 break; 2685 default: 2686 assert(false,"Should not reach here."); 2687 break; 2688 } 2689 } 2690 2691 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2692 assert(rscratch != noreg || always_reachable(src2), "missing"); 2693 2694 switch(typ) { 2695 case T_BOOLEAN: 2696 case T_BYTE: 2697 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2698 break; 2699 case T_CHAR: 2700 case T_SHORT: 2701 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2702 break; 2703 case T_INT: 2704 case T_FLOAT: 2705 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2706 break; 2707 case T_LONG: 2708 case T_DOUBLE: 2709 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2710 break; 2711 default: 2712 assert(false,"Should not reach here."); 2713 break; 2714 } 2715 } 2716 2717 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2718 switch(typ) { 2719 case T_BYTE: 2720 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2721 break; 2722 case T_SHORT: 2723 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2724 break; 2725 case T_INT: 2726 case T_FLOAT: 2727 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2728 break; 2729 case T_LONG: 2730 case T_DOUBLE: 2731 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2732 break; 2733 default: 2734 assert(false,"Should not reach here."); 2735 break; 2736 } 2737 } 2738 2739 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2740 
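// Sets the flags for the caller's branch by testing src1 (the mask) against
// src2. Masks shorter than 16 bytes are first duplicated so the whole
// register carries defined bits; vtestps inspects only the per-lane sign
// bits, which suffices when every mask element is all-ones or all-zero and
// esize >= 4.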
assert(vlen_in_bytes <= 32, ""); 2741 int esize = type2aelembytes(bt); 2742 if (vlen_in_bytes == 32) { 2743 assert(vtmp == xnoreg, "required."); 2744 if (esize >= 4) { 2745 vtestps(src1, src2, AVX_256bit); 2746 } else { 2747 vptest(src1, src2, AVX_256bit); 2748 } 2749 return; 2750 } 2751 if (vlen_in_bytes < 16) { 2752 // Duplicate the lower part to fill the whole register, 2753 // Don't need to do so for src2 2754 assert(vtmp != xnoreg, "required"); 2755 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2756 pshufd(vtmp, src1, shuffle_imm); 2757 } else { 2758 assert(vtmp == xnoreg, "required"); 2759 vtmp = src1; 2760 } 2761 if (esize >= 4 && VM_Version::supports_avx()) { 2762 vtestps(vtmp, src2, AVX_128bit); 2763 } else { 2764 ptest(vtmp, src2); 2765 } 2766 } 2767 2768 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2769 assert(UseAVX >= 2, "required"); 2770 #ifdef ASSERT 2771 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2772 bool is_bw_supported = VM_Version::supports_avx512bw(); 2773 if (is_bw && !is_bw_supported) { 2774 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2775 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2776 "XMM register should be 0-15"); 2777 } 2778 #endif // ASSERT 2779 switch (elem_bt) { 2780 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2781 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2782 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2783 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2784 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2785 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2786 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2787 } 2788 } 2789 2790 #ifdef _LP64 2791 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2792 assert(UseAVX >= 2, "required"); 2793 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2794 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2795 if ((UseAVX > 2) && 2796 (!is_bw || VM_Version::supports_avx512bw()) && 2797 (!is_vl || VM_Version::supports_avx512vl())) { 2798 switch (elem_bt) { 2799 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2800 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2801 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2802 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2803 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2804 } 2805 } else { 2806 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2807 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2808 switch (elem_bt) { 2809 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2810 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2811 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2812 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2813 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2814 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2815 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2816 } 2817 } 2818 } 2819 #endif 2820 2821 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2822 switch (to_elem_bt) { 2823 case T_SHORT: 2824 vpmovsxbw(dst, src, vlen_enc); 2825 
break; 2826 case T_INT: 2827 vpmovsxbd(dst, src, vlen_enc); 2828 break; 2829 case T_FLOAT: 2830 vpmovsxbd(dst, src, vlen_enc); 2831 vcvtdq2ps(dst, dst, vlen_enc); 2832 break; 2833 case T_LONG: 2834 vpmovsxbq(dst, src, vlen_enc); 2835 break; 2836 case T_DOUBLE: { 2837 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2838 vpmovsxbd(dst, src, mid_vlen_enc); 2839 vcvtdq2pd(dst, dst, vlen_enc); 2840 break; 2841 } 2842 default: 2843 fatal("Unsupported type %s", type2name(to_elem_bt)); 2844 break; 2845 } 2846 } 2847 2848 //------------------------------------------------------------------------------------------- 2849 2850 // IndexOf for constant substrings with size >= 8 chars 2851 // which don't need to be loaded through the stack. 2852 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2853 Register cnt1, Register cnt2, 2854 int int_cnt2, Register result, 2855 XMMRegister vec, Register tmp, 2856 int ae) { 2857 ShortBranchVerifier sbv(this); 2858 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2859 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2860 2861 // This method uses the pcmpestri instruction with bound registers 2862 // inputs: 2863 // xmm - substring 2864 // rax - substring length (elements count) 2865 // mem - scanned string 2866 // rdx - string length (elements count) 2867 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2868 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2869 // outputs: 2870 // rcx - matched index in string 2871 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2872 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2873 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2874 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2875 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2876 2877 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2878 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2879 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2880 2881 // Note, inline_string_indexOf() generates checks: 2882 // if (substr.count > string.count) return -1; 2883 // if (substr.count == 0) return 0; 2884 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2885 2886 // Load substring. 2887 if (ae == StrIntrinsicNode::UL) { 2888 pmovzxbw(vec, Address(str2, 0)); 2889 } else { 2890 movdqu(vec, Address(str2, 0)); 2891 } 2892 movl(cnt2, int_cnt2); 2893 movptr(result, str1); // string addr 2894 2895 if (int_cnt2 > stride) { 2896 jmpb(SCAN_TO_SUBSTR); 2897 2898 // Reload substr for rescan; this code 2899 // is executed only for large substrings (> 8 chars) 2900 bind(RELOAD_SUBSTR); 2901 if (ae == StrIntrinsicNode::UL) { 2902 pmovzxbw(vec, Address(str2, 0)); 2903 } else { 2904 movdqu(vec, Address(str2, 0)); 2905 } 2906 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2907 2908 bind(RELOAD_STR); 2909 // We came here after the beginning of the substring was 2910 // matched but the rest of it was not, so we need to search 2911 // again. Start from the next element after the previous match. 2912 2913 // cnt2 is the number of substring elements remaining and 2914 // cnt1 is the number of string elements remaining when cmp failed.
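// cnt1 went down in step with the elements already matched, and the number of
// matched elements is int_cnt2 - cnt2, so adding int_cnt2 and subtracting
// cnt2 restores cnt1 to its value at the candidate position: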
2915 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2916 subl(cnt1, cnt2); 2917 addl(cnt1, int_cnt2); 2918 movl(cnt2, int_cnt2); // Now restore cnt2 2919 2920 decrementl(cnt1); // Shift to next element 2921 cmpl(cnt1, cnt2); 2922 jcc(Assembler::negative, RET_NOT_FOUND); // Fewer chars left than substring 2923 2924 addptr(result, (1<<scale1)); 2925 2926 } // (int_cnt2 > 8) 2927 2928 // Scan string for start of substr in 16-byte vectors 2929 bind(SCAN_TO_SUBSTR); 2930 pcmpestri(vec, Address(result, 0), mode); 2931 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2932 subl(cnt1, stride); 2933 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2934 cmpl(cnt1, cnt2); 2935 jccb(Assembler::negative, RET_NOT_FOUND); // Fewer chars left than substring 2936 addptr(result, 16); 2937 jmpb(SCAN_TO_SUBSTR); 2938 2939 // Found a potential substr 2940 bind(FOUND_CANDIDATE); 2941 // Matched whole vector if first element matched (tmp(rcx) == 0). 2942 if (int_cnt2 == stride) { 2943 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2944 } else { // int_cnt2 > 8 2945 jccb(Assembler::overflow, FOUND_SUBSTR); 2946 } 2947 // After pcmpestri tmp(rcx) contains matched element index 2948 // Compute start addr of substr 2949 lea(result, Address(result, tmp, scale1)); 2950 2951 // Make sure string is still long enough 2952 subl(cnt1, tmp); 2953 cmpl(cnt1, cnt2); 2954 if (int_cnt2 == stride) { 2955 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2956 } else { // int_cnt2 > 8 2957 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2958 } 2959 // Fewer chars left than substring. 2960 2961 bind(RET_NOT_FOUND); 2962 movl(result, -1); 2963 jmp(EXIT); 2964 2965 if (int_cnt2 > stride) { 2966 // This code is optimized for the case when the whole substring 2967 // is matched if its head is matched. 2968 bind(MATCH_SUBSTR_HEAD); 2969 pcmpestri(vec, Address(result, 0), mode); 2970 // Reload only the string if it does not match 2971 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2972 2973 Label CONT_SCAN_SUBSTR; 2974 // Compare the rest of the substring (> 8 chars). 2975 bind(FOUND_SUBSTR); 2976 // First 8 chars are already matched.
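// Turn cnt2 into a negative index, -(cnt2 - stride), counting the elements
// still to be compared; the tail loop below steps it up by stride until it
// reaches zero.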
2977 negptr(cnt2); 2978 addptr(cnt2, stride); 2979 2980 bind(SCAN_SUBSTR); 2981 subl(cnt1, stride); 2982 cmpl(cnt2, -stride); // Do not read beyond substring 2983 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2984 // Back-up strings to avoid reading beyond substring: 2985 // cnt1 = cnt1 - cnt2 + 8 2986 addl(cnt1, cnt2); // cnt2 is negative 2987 addl(cnt1, stride); 2988 movl(cnt2, stride); negptr(cnt2); 2989 bind(CONT_SCAN_SUBSTR); 2990 if (int_cnt2 < (int)G) { 2991 int tail_off1 = int_cnt2<<scale1; 2992 int tail_off2 = int_cnt2<<scale2; 2993 if (ae == StrIntrinsicNode::UL) { 2994 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2995 } else { 2996 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2997 } 2998 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2999 } else { 3000 // calculate index in register to avoid integer overflow (int_cnt2*2) 3001 movl(tmp, int_cnt2); 3002 addptr(tmp, cnt2); 3003 if (ae == StrIntrinsicNode::UL) { 3004 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 3005 } else { 3006 movdqu(vec, Address(str2, tmp, scale2, 0)); 3007 } 3008 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 3009 } 3010 // Need to reload strings pointers if not matched whole vector 3011 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3012 addptr(cnt2, stride); 3013 jcc(Assembler::negative, SCAN_SUBSTR); 3014 // Fall through if found full substring 3015 3016 } // (int_cnt2 > 8) 3017 3018 bind(RET_FOUND); 3019 // Found result if we matched full small substring. 3020 // Compute substr offset 3021 subptr(result, str1); 3022 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3023 shrl(result, 1); // index 3024 } 3025 bind(EXIT); 3026 3027 } // string_indexofC8 3028 3029 // Small strings are loaded through stack if they cross page boundary. 3030 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 3031 Register cnt1, Register cnt2, 3032 int int_cnt2, Register result, 3033 XMMRegister vec, Register tmp, 3034 int ae) { 3035 ShortBranchVerifier sbv(this); 3036 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3037 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3038 3039 // 3040 // int_cnt2 is length of small (< 8 chars) constant substring 3041 // or (-1) for non constant substring in which case its length 3042 // is in cnt2 register. 3043 // 3044 // Note, inline_string_indexOf() generates checks: 3045 // if (substr.count > string.count) return -1; 3046 // if (substr.count == 0) return 0; 3047 // 3048 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 3049 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 3050 // This method uses the pcmpestri instruction with bound registers 3051 // inputs: 3052 // xmm - substring 3053 // rax - substring length (elements count) 3054 // mem - scanned string 3055 // rdx - string length (elements count) 3056 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 3057 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 3058 // outputs: 3059 // rcx - matched index in string 3060 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3061 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 3062 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 3063 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 3064 3065 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 3066 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 3067 FOUND_CANDIDATE; 3068 3069 { //======================================================== 3070 // We don't know where these strings are located 3071 // and we can't read beyond them. Load them through the stack. 3072 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 3073 3074 movptr(tmp, rsp); // save old SP 3075 3076 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 3077 if (int_cnt2 == (1>>scale2)) { // One byte 3078 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 3079 load_unsigned_byte(result, Address(str2, 0)); 3080 movdl(vec, result); // move 32 bits 3081 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 3082 // Not enough header space in 32-bit VM: 12+3 = 15. 3083 movl(result, Address(str2, -1)); 3084 shrl(result, 8); 3085 movdl(vec, result); // move 32 bits 3086 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 3087 load_unsigned_short(result, Address(str2, 0)); 3088 movdl(vec, result); // move 32 bits 3089 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 3090 movdl(vec, Address(str2, 0)); // move 32 bits 3091 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 3092 movq(vec, Address(str2, 0)); // move 64 bits 3093 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 3094 // Array header size is 12 bytes in 32-bit VM 3095 // + 6 bytes for 3 chars == 18 bytes, 3096 // enough space to load vec and shift. 3097 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3098 if (ae == StrIntrinsicNode::UL) { 3099 int tail_off = int_cnt2-8; 3100 pmovzxbw(vec, Address(str2, tail_off)); 3101 psrldq(vec, -2*tail_off); 3102 } 3103 else { 3104 int tail_off = int_cnt2*(1<<scale2); 3105 movdqu(vec, Address(str2, tail_off-16)); 3106 psrldq(vec, 16-tail_off); 3107 } 3108 } 3109 } else { // not constant substring 3110 cmpl(cnt2, stride); 3111 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3112 3113 // We can read beyond the string if str2+16 does not cross a page boundary 3114 // since heaps are aligned and mapped by pages. 3115 assert(os::vm_page_size() < (int)G, "default page should be small"); 3116 movl(result, str2); // We need only low 32 bits 3117 andl(result, ((int)os::vm_page_size()-1)); 3118 cmpl(result, ((int)os::vm_page_size()-16)); 3119 jccb(Assembler::belowEqual, CHECK_STR); 3120 3121 // Move small strings to the stack to allow loading 16 bytes into vec. 3122 subptr(rsp, 16); 3123 int stk_offset = wordSize-(1<<scale2); 3124 push(cnt2); 3125 3126 bind(COPY_SUBSTR); 3127 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3128 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3129 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3130 } else if (ae == StrIntrinsicNode::UU) { 3131 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3132 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3133 } 3134 decrement(cnt2); 3135 jccb(Assembler::notZero, COPY_SUBSTR); 3136 3137 pop(cnt2); 3138 movptr(str2, rsp); // New substring address 3139 } // non constant 3140 3141 bind(CHECK_STR); 3142 cmpl(cnt1, stride); 3143 jccb(Assembler::aboveEqual, BIG_STRINGS); 3144 3145 // Check cross page boundary.
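// Same page-boundary check for str1: a 16-byte load starting at str1 is safe when (str1 & (page_size-1)) <= page_size-16, since heaps are mapped in whole pages.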
3146 movl(result, str1); // We need only low 32 bits 3147 andl(result, ((int)os::vm_page_size()-1)); 3148 cmpl(result, ((int)os::vm_page_size()-16)); 3149 jccb(Assembler::belowEqual, BIG_STRINGS); 3150 3151 subptr(rsp, 16); 3152 int stk_offset = -(1<<scale1); 3153 if (int_cnt2 < 0) { // not constant 3154 push(cnt2); 3155 stk_offset += wordSize; 3156 } 3157 movl(cnt2, cnt1); 3158 3159 bind(COPY_STR); 3160 if (ae == StrIntrinsicNode::LL) { 3161 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3162 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3163 } else { 3164 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3165 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3166 } 3167 decrement(cnt2); 3168 jccb(Assembler::notZero, COPY_STR); 3169 3170 if (int_cnt2 < 0) { // not constant 3171 pop(cnt2); 3172 } 3173 movptr(str1, rsp); // New string address 3174 3175 bind(BIG_STRINGS); 3176 // Load substring. 3177 if (int_cnt2 < 0) { // -1 3178 if (ae == StrIntrinsicNode::UL) { 3179 pmovzxbw(vec, Address(str2, 0)); 3180 } else { 3181 movdqu(vec, Address(str2, 0)); 3182 } 3183 push(cnt2); // substr count 3184 push(str2); // substr addr 3185 push(str1); // string addr 3186 } else { 3187 // Small (< 8 chars) constant substrings are loaded already. 3188 movl(cnt2, int_cnt2); 3189 } 3190 push(tmp); // original SP 3191 3192 } // Finished loading 3193 3194 //======================================================== 3195 // Start search 3196 // 3197 3198 movptr(result, str1); // string addr 3199 3200 if (int_cnt2 < 0) { // Only for non constant substring 3201 jmpb(SCAN_TO_SUBSTR); 3202 3203 // SP saved at sp+0 3204 // String saved at sp+1*wordSize 3205 // Substr saved at sp+2*wordSize 3206 // Substr count saved at sp+3*wordSize 3207 3208 // Reload substr for rescan, this code 3209 // is executed only for large substrings (> 8 chars) 3210 bind(RELOAD_SUBSTR); 3211 movptr(str2, Address(rsp, 2*wordSize)); 3212 movl(cnt2, Address(rsp, 3*wordSize)); 3213 if (ae == StrIntrinsicNode::UL) { 3214 pmovzxbw(vec, Address(str2, 0)); 3215 } else { 3216 movdqu(vec, Address(str2, 0)); 3217 } 3218 // We came here after the beginning of the substring was 3219 // matched but the rest of it was not, so we need to search 3220 // again. Start from the next element after the previous match. 3221 subptr(str1, result); // Restore counter 3222 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3223 shrl(str1, 1); 3224 } 3225 addl(cnt1, str1); 3226 decrementl(cnt1); // Shift to next element 3227 cmpl(cnt1, cnt2); 3228 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 3229 3230 addptr(result, (1<<scale1)); 3231 } // non constant 3232 3233 // Scan string for start of substr in 16-byte vectors 3234 bind(SCAN_TO_SUBSTR); 3235 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3236 pcmpestri(vec, Address(result, 0), mode); 3237 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3238 subl(cnt1, stride); 3239 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3240 cmpl(cnt1, cnt2); 3241 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 3242 addptr(result, 16); 3243 3244 bind(ADJUST_STR); 3245 cmpl(cnt1, stride); // Do not read beyond string 3246 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3247 // Back-up string to avoid reading beyond string.
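// Move 'result' back so that the final 16-byte load ends exactly at the last element of the string, then rescan that last full vector.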
3248 lea(result, Address(result, cnt1, scale1, -16)); 3249 movl(cnt1, stride); 3250 jmpb(SCAN_TO_SUBSTR); 3251 3252 // Found a potential substr 3253 bind(FOUND_CANDIDATE); 3254 // After pcmpestri tmp(rcx) contains matched element index 3255 3256 // Make sure string is still long enough 3257 subl(cnt1, tmp); 3258 cmpl(cnt1, cnt2); 3259 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3260 // Left less than substring. 3261 3262 bind(RET_NOT_FOUND); 3263 movl(result, -1); 3264 jmp(CLEANUP); 3265 3266 bind(FOUND_SUBSTR); 3267 // Compute start addr of substr 3268 lea(result, Address(result, tmp, scale1)); 3269 if (int_cnt2 > 0) { // Constant substring 3270 // Repeat search for small substring (< 8 chars) 3271 // from new point without reloading substring. 3272 // Have to check that we don't read beyond string. 3273 cmpl(tmp, stride-int_cnt2); 3274 jccb(Assembler::greater, ADJUST_STR); 3275 // Fall through if matched whole substring. 3276 } else { // non constant 3277 assert(int_cnt2 == -1, "should be != 0"); 3278 3279 addl(tmp, cnt2); 3280 // Found result if we matched whole substring. 3281 cmpl(tmp, stride); 3282 jcc(Assembler::lessEqual, RET_FOUND); 3283 3284 // Repeat search for small substring (<= 8 chars) 3285 // from new point 'str1' without reloading substring. 3286 cmpl(cnt2, stride); 3287 // Have to check that we don't read beyond string. 3288 jccb(Assembler::lessEqual, ADJUST_STR); 3289 3290 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3291 // Compare the rest of substring (> 8 chars). 3292 movptr(str1, result); 3293 3294 cmpl(tmp, cnt2); 3295 // First 8 chars are already matched. 3296 jccb(Assembler::equal, CHECK_NEXT); 3297 3298 bind(SCAN_SUBSTR); 3299 pcmpestri(vec, Address(str1, 0), mode); 3300 // Need to reload string pointers if we did not match the whole vector 3301 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3302 3303 bind(CHECK_NEXT); 3304 subl(cnt2, stride); 3305 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3306 addptr(str1, 16); 3307 if (ae == StrIntrinsicNode::UL) { 3308 addptr(str2, 8); 3309 } else { 3310 addptr(str2, 16); 3311 } 3312 subl(cnt1, stride); 3313 cmpl(cnt2, stride); // Do not read beyond substring 3314 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3315 // Back-up strings to avoid reading beyond substring.
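// Move str1/str2 back so the next vector load ends exactly at the end of the substring (8 substring bytes for UL, 16 otherwise), and adjust cnt1/cnt2 accordingly.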
3316 3317 if (ae == StrIntrinsicNode::UL) { 3318 lea(str2, Address(str2, cnt2, scale2, -8)); 3319 lea(str1, Address(str1, cnt2, scale1, -16)); 3320 } else { 3321 lea(str2, Address(str2, cnt2, scale2, -16)); 3322 lea(str1, Address(str1, cnt2, scale1, -16)); 3323 } 3324 subl(cnt1, cnt2); 3325 movl(cnt2, stride); 3326 addl(cnt1, stride); 3327 bind(CONT_SCAN_SUBSTR); 3328 if (ae == StrIntrinsicNode::UL) { 3329 pmovzxbw(vec, Address(str2, 0)); 3330 } else { 3331 movdqu(vec, Address(str2, 0)); 3332 } 3333 jmp(SCAN_SUBSTR); 3334 3335 bind(RET_FOUND_LONG); 3336 movptr(str1, Address(rsp, wordSize)); 3337 } // non constant 3338 3339 bind(RET_FOUND); 3340 // Compute substr offset 3341 subptr(result, str1); 3342 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3343 shrl(result, 1); // index 3344 } 3345 bind(CLEANUP); 3346 pop(rsp); // restore SP 3347 3348 } // string_indexof 3349 3350 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3351 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3352 ShortBranchVerifier sbv(this); 3353 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3354 3355 int stride = 8; 3356 3357 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3358 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3359 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3360 FOUND_SEQ_CHAR, DONE_LABEL; 3361 3362 movptr(result, str1); 3363 if (UseAVX >= 2) { 3364 cmpl(cnt1, stride); 3365 jcc(Assembler::less, SCAN_TO_CHAR); 3366 cmpl(cnt1, 2*stride); 3367 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3368 movdl(vec1, ch); 3369 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3370 vpxor(vec2, vec2); 3371 movl(tmp, cnt1); 3372 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3373 andl(cnt1,0x0000000F); //tail count (in chars) 3374 3375 bind(SCAN_TO_16_CHAR_LOOP); 3376 vmovdqu(vec3, Address(result, 0)); 3377 vpcmpeqw(vec3, vec3, vec1, 1); 3378 vptest(vec2, vec3); 3379 jcc(Assembler::carryClear, FOUND_CHAR); 3380 addptr(result, 32); 3381 subl(tmp, 2*stride); 3382 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3383 jmp(SCAN_TO_8_CHAR); 3384 bind(SCAN_TO_8_CHAR_INIT); 3385 movdl(vec1, ch); 3386 pshuflw(vec1, vec1, 0x00); 3387 pshufd(vec1, vec1, 0); 3388 pxor(vec2, vec2); 3389 } 3390 bind(SCAN_TO_8_CHAR); 3391 cmpl(cnt1, stride); 3392 jcc(Assembler::less, SCAN_TO_CHAR); 3393 if (UseAVX < 2) { 3394 movdl(vec1, ch); 3395 pshuflw(vec1, vec1, 0x00); 3396 pshufd(vec1, vec1, 0); 3397 pxor(vec2, vec2); 3398 } 3399 movl(tmp, cnt1); 3400 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3401 andl(cnt1,0x00000007); //tail count (in chars) 3402 3403 bind(SCAN_TO_8_CHAR_LOOP); 3404 movdqu(vec3, Address(result, 0)); 3405 pcmpeqw(vec3, vec1); 3406 ptest(vec2, vec3); 3407 jcc(Assembler::carryClear, FOUND_CHAR); 3408 addptr(result, 16); 3409 subl(tmp, stride); 3410 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3411 bind(SCAN_TO_CHAR); 3412 testl(cnt1, cnt1); 3413 jcc(Assembler::zero, RET_NOT_FOUND); 3414 bind(SCAN_TO_CHAR_LOOP); 3415 load_unsigned_short(tmp, Address(result, 0)); 3416 cmpl(ch, tmp); 3417 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3418 addptr(result, 2); 3419 subl(cnt1, 1); 3420 jccb(Assembler::zero, RET_NOT_FOUND); 3421 jmp(SCAN_TO_CHAR_LOOP); 3422 3423 bind(RET_NOT_FOUND); 3424 movl(result, -1); 3425 jmpb(DONE_LABEL); 3426 3427 bind(FOUND_CHAR); 3428 if (UseAVX >= 2) { 3429 vpmovmskb(tmp, vec3); 3430 } else { 3431 pmovmskb(tmp, vec3); 3432 } 3433 bsfl(ch, tmp); 3434 addptr(result, ch); 3435 3436 bind(FOUND_SEQ_CHAR); 3437 
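// Convert the address of the matched char back into a UTF-16 index: byte offset from str1, divided by 2.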
subptr(result, str1); 3438 shrl(result, 1); 3439 3440 bind(DONE_LABEL); 3441 } // string_indexof_char 3442 3443 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3444 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3445 ShortBranchVerifier sbv(this); 3446 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3447 3448 int stride = 16; 3449 3450 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3451 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3452 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3453 FOUND_SEQ_CHAR, DONE_LABEL; 3454 3455 movptr(result, str1); 3456 if (UseAVX >= 2) { 3457 cmpl(cnt1, stride); 3458 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3459 cmpl(cnt1, stride*2); 3460 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3461 movdl(vec1, ch); 3462 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3463 vpxor(vec2, vec2); 3464 movl(tmp, cnt1); 3465 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3466 andl(cnt1,0x0000001F); //tail count (in chars) 3467 3468 bind(SCAN_TO_32_CHAR_LOOP); 3469 vmovdqu(vec3, Address(result, 0)); 3470 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3471 vptest(vec2, vec3); 3472 jcc(Assembler::carryClear, FOUND_CHAR); 3473 addptr(result, 32); 3474 subl(tmp, stride*2); 3475 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3476 jmp(SCAN_TO_16_CHAR); 3477 3478 bind(SCAN_TO_16_CHAR_INIT); 3479 movdl(vec1, ch); 3480 pxor(vec2, vec2); 3481 pshufb(vec1, vec2); 3482 } 3483 3484 bind(SCAN_TO_16_CHAR); 3485 cmpl(cnt1, stride); 3486 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3487 if (UseAVX < 2) { 3488 movdl(vec1, ch); 3489 pxor(vec2, vec2); 3490 pshufb(vec1, vec2); 3491 } 3492 movl(tmp, cnt1); 3493 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3494 andl(cnt1,0x0000000F); //tail count (in bytes) 3495 3496 bind(SCAN_TO_16_CHAR_LOOP); 3497 movdqu(vec3, Address(result, 0)); 3498 pcmpeqb(vec3, vec1); 3499 ptest(vec2, vec3); 3500 jcc(Assembler::carryClear, FOUND_CHAR); 3501 addptr(result, 16); 3502 subl(tmp, stride); 3503 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
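// Scalar tail: fewer than 16 bytes remain; compare them one byte at a time.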
3504 3505 bind(SCAN_TO_CHAR_INIT); 3506 testl(cnt1, cnt1); 3507 jcc(Assembler::zero, RET_NOT_FOUND); 3508 bind(SCAN_TO_CHAR_LOOP); 3509 load_unsigned_byte(tmp, Address(result, 0)); 3510 cmpl(ch, tmp); 3511 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3512 addptr(result, 1); 3513 subl(cnt1, 1); 3514 jccb(Assembler::zero, RET_NOT_FOUND); 3515 jmp(SCAN_TO_CHAR_LOOP); 3516 3517 bind(RET_NOT_FOUND); 3518 movl(result, -1); 3519 jmpb(DONE_LABEL); 3520 3521 bind(FOUND_CHAR); 3522 if (UseAVX >= 2) { 3523 vpmovmskb(tmp, vec3); 3524 } else { 3525 pmovmskb(tmp, vec3); 3526 } 3527 bsfl(ch, tmp); 3528 addptr(result, ch); 3529 3530 bind(FOUND_SEQ_CHAR); 3531 subptr(result, str1); 3532 3533 bind(DONE_LABEL); 3534 } // stringL_indexof_char 3535 3536 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3537 switch (eltype) { 3538 case T_BOOLEAN: return sizeof(jboolean); 3539 case T_BYTE: return sizeof(jbyte); 3540 case T_SHORT: return sizeof(jshort); 3541 case T_CHAR: return sizeof(jchar); 3542 case T_INT: return sizeof(jint); 3543 default: 3544 ShouldNotReachHere(); 3545 return -1; 3546 } 3547 } 3548 3549 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3550 switch (eltype) { 3551 // T_BOOLEAN used as surrogate for unsigned byte 3552 case T_BOOLEAN: movzbl(dst, src); break; 3553 case T_BYTE: movsbl(dst, src); break; 3554 case T_SHORT: movswl(dst, src); break; 3555 case T_CHAR: movzwl(dst, src); break; 3556 case T_INT: movl(dst, src); break; 3557 default: 3558 ShouldNotReachHere(); 3559 } 3560 } 3561 3562 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3563 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3564 } 3565 3566 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3567 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3568 } 3569 3570 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3571 const int vlen = Assembler::AVX_256bit; 3572 switch (eltype) { 3573 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3574 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3575 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3576 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3577 case T_INT: 3578 // do nothing 3579 break; 3580 default: 3581 ShouldNotReachHere(); 3582 } 3583 } 3584 3585 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3586 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3587 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3588 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3589 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3590 BasicType eltype) { 3591 ShortBranchVerifier sbv(this); 3592 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3593 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3594 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3595 3596 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3597 SHORT_UNROLLED_LOOP_EXIT, 3598 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3599 UNROLLED_VECTOR_LOOP_BEGIN, 3600 END; 3601 switch (eltype) { 3602 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3603 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3604 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3605 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3606 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3607 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3608 } 3609 3610 // "Renaming" for readability of the code 3611 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3612 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3613 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3614 3615 const int elsize = arrays_hashcode_elsize(eltype); 3616 3617 /* 3618 if (cnt1 >= 2) { 3619 if (cnt1 >= 32) { 3620 UNROLLED VECTOR LOOP 3621 } 3622 UNROLLED SCALAR LOOP 3623 } 3624 SINGLE SCALAR 3625 */ 3626 3627 cmpl(cnt1, 32); 3628 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3629 3630 // cnt1 >= 32 && generate_vectorized_loop 3631 xorl(index, index); 3632 3633 // vresult = IntVector.zero(I256); 3634 for (int idx = 0; idx < 4; idx++) { 3635 vpxor(vresult[idx], vresult[idx]); 3636 } 3637 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3638 Register bound = tmp2; 3639 Register next = tmp3; 3640 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3641 movl(next, Address(tmp2, 0)); 3642 movdl(vnext, next); 3643 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3644 3645 // index = 0; 3646 // bound = cnt1 & ~(32 - 1); 3647 movl(bound, cnt1); 3648 andl(bound, ~(32 - 1)); 3649 // for (; index < bound; index += 32) { 3650 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3651 // result *= next; 3652 imull(result, next); 3653 // loop fission to front-load the cost of fetching from memory; OOO execution 3654 // can then hopefully do a better job of prefetching 3655 for (int idx = 0; idx < 4; idx++) { 3656 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3657 } 3658 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3659 for (int idx = 0; idx < 4; idx++) { 3660 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3661 arrays_hashcode_elvcast(vtmp[idx], eltype); 3662 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3663 } 3664 // index += 32; 3665 addl(index, 32); 3666 // index < bound; 3667 cmpl(index, bound); 3668 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3669 // } 3670 3671 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3672 subl(cnt1, bound); 3673 // release bound 3674 3675 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3676 for (int idx = 0; idx < 4; idx++) { 3677 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3678 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3679 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3680 } 3681 // result += vresult.reduceLanes(ADD); 3682 for (int idx = 0; idx < 4; idx++) { 3683 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3684 } 3685 3686 // } else if (cnt1 < 32) { 3687 3688 bind(SHORT_UNROLLED_BEGIN); 3689 // int i = 1; 3690 movl(index, 1); 3691 cmpl(index, cnt1); 3692 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3693 3694 // for (; i < cnt1 ; i += 2) { 3695 bind(SHORT_UNROLLED_LOOP_BEGIN); 3696 movl(tmp3, 961); 3697 imull(result, tmp3); 3698
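// Two elements per iteration, roughly: result = 31*31*result + 31*a[i-1] + a[i]; 961 == 31*31, and 31*x is computed below as (x << 5) - x.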
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3699 movl(tmp3, tmp2); 3700 shll(tmp3, 5); 3701 subl(tmp3, tmp2); 3702 addl(result, tmp3); 3703 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3704 addl(result, tmp3); 3705 addl(index, 2); 3706 cmpl(index, cnt1); 3707 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3708 3709 // } 3710 // if (i >= cnt1) { 3711 bind(SHORT_UNROLLED_LOOP_EXIT); 3712 jccb(Assembler::greater, END); 3713 movl(tmp2, result); 3714 shll(result, 5); 3715 subl(result, tmp2); 3716 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3717 addl(result, tmp3); 3718 // } 3719 bind(END); 3720 3721 BLOCK_COMMENT("} // arrays_hashcode"); 3722 3723 } // arrays_hashcode 3724 3725 // helper function for string_compare 3726 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3727 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3728 Address::ScaleFactor scale2, Register index, int ae) { 3729 if (ae == StrIntrinsicNode::LL) { 3730 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3731 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3732 } else if (ae == StrIntrinsicNode::UU) { 3733 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3734 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3735 } else { 3736 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3737 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3738 } 3739 } 3740 3741 // Compare strings, used for char[] and byte[]. 3742 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3743 Register cnt1, Register cnt2, Register result, 3744 XMMRegister vec1, int ae, KRegister mask) { 3745 ShortBranchVerifier sbv(this); 3746 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3747 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3748 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3749 int stride2x2 = 0x40; 3750 Address::ScaleFactor scale = Address::no_scale; 3751 Address::ScaleFactor scale1 = Address::no_scale; 3752 Address::ScaleFactor scale2 = Address::no_scale; 3753 3754 if (ae != StrIntrinsicNode::LL) { 3755 stride2x2 = 0x20; 3756 } 3757 3758 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3759 shrl(cnt2, 1); 3760 } 3761 // Compute the minimum of the string lengths and the 3762 // difference of the string lengths (stack). 3763 // Do the conditional move stuff 3764 movl(result, cnt1); 3765 subl(cnt1, cnt2); 3766 push(cnt1); 3767 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3768 3769 // Is the minimum length zero? 
3770 testl(cnt2, cnt2); 3771 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3772 if (ae == StrIntrinsicNode::LL) { 3773 // Load first bytes 3774 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3775 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3776 } else if (ae == StrIntrinsicNode::UU) { 3777 // Load first characters 3778 load_unsigned_short(result, Address(str1, 0)); 3779 load_unsigned_short(cnt1, Address(str2, 0)); 3780 } else { 3781 load_unsigned_byte(result, Address(str1, 0)); 3782 load_unsigned_short(cnt1, Address(str2, 0)); 3783 } 3784 subl(result, cnt1); 3785 jcc(Assembler::notZero, POP_LABEL); 3786 3787 if (ae == StrIntrinsicNode::UU) { 3788 // Divide length by 2 to get number of chars 3789 shrl(cnt2, 1); 3790 } 3791 cmpl(cnt2, 1); 3792 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3793 3794 // Check if the strings start at the same location and setup scale and stride 3795 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3796 cmpptr(str1, str2); 3797 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3798 if (ae == StrIntrinsicNode::LL) { 3799 scale = Address::times_1; 3800 stride = 16; 3801 } else { 3802 scale = Address::times_2; 3803 stride = 8; 3804 } 3805 } else { 3806 scale1 = Address::times_1; 3807 scale2 = Address::times_2; 3808 // scale not used 3809 stride = 8; 3810 } 3811 3812 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3813 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3814 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3815 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3816 Label COMPARE_TAIL_LONG; 3817 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3818 3819 int pcmpmask = 0x19; 3820 if (ae == StrIntrinsicNode::LL) { 3821 pcmpmask &= ~0x01; 3822 } 3823 3824 // Setup to compare 16-chars (32-bytes) vectors, 3825 // start from first character again because it has aligned address. 3826 if (ae == StrIntrinsicNode::LL) { 3827 stride2 = 32; 3828 } else { 3829 stride2 = 16; 3830 } 3831 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3832 adr_stride = stride << scale; 3833 } else { 3834 adr_stride1 = 8; //stride << scale1; 3835 adr_stride2 = 16; //stride << scale2; 3836 } 3837 3838 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3839 // rax and rdx are used by pcmpestri as elements counters 3840 movl(result, cnt2); 3841 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3842 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3843 3844 // fast path : compare first 2 8-char vectors. 
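// pcmpmask 0x19 (0x18 for LL) selects equal-each comparison with negated result: pcmpestri sets CF when some element within the valid lengths differs and leaves the index of the first mismatch in rcx.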
3845 bind(COMPARE_16_CHARS); 3846 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3847 movdqu(vec1, Address(str1, 0)); 3848 } else { 3849 pmovzxbw(vec1, Address(str1, 0)); 3850 } 3851 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3852 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3853 3854 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3855 movdqu(vec1, Address(str1, adr_stride)); 3856 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3857 } else { 3858 pmovzxbw(vec1, Address(str1, adr_stride1)); 3859 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3860 } 3861 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3862 addl(cnt1, stride); 3863 3864 // Compare the characters at index in cnt1 3865 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3866 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3867 subl(result, cnt2); 3868 jmp(POP_LABEL); 3869 3870 // Setup the registers to start vector comparison loop 3871 bind(COMPARE_WIDE_VECTORS); 3872 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3873 lea(str1, Address(str1, result, scale)); 3874 lea(str2, Address(str2, result, scale)); 3875 } else { 3876 lea(str1, Address(str1, result, scale1)); 3877 lea(str2, Address(str2, result, scale2)); 3878 } 3879 subl(result, stride2); 3880 subl(cnt2, stride2); 3881 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3882 negptr(result); 3883 3884 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3885 bind(COMPARE_WIDE_VECTORS_LOOP); 3886 3887 #ifdef _LP64 3888 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3889 cmpl(cnt2, stride2x2); 3890 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3891 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3892 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3893 3894 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3895 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3896 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3897 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3898 } else { 3899 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3900 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3901 } 3902 kortestql(mask, mask); 3903 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3904 addptr(result, stride2x2); // update since we already compared at this addr 3905 subl(cnt2, stride2x2); // and sub the size too 3906 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3907 3908 vpxor(vec1, vec1); 3909 jmpb(COMPARE_WIDE_TAIL); 3910 }//if (VM_Version::supports_avx512vlbw()) 3911 #endif // _LP64 3912 3913 3914 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3915 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3916 vmovdqu(vec1, Address(str1, result, scale)); 3917 vpxor(vec1, Address(str2, result, scale)); 3918 } else { 3919 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3920 vpxor(vec1, Address(str2, result, scale2)); 3921 } 3922 vptest(vec1, vec1); 3923 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3924 addptr(result, stride2); 3925 subl(cnt2, stride2); 3926 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3927 // clean upper bits of YMM registers 
3928 vpxor(vec1, vec1); 3929 3930 // compare wide vectors tail 3931 bind(COMPARE_WIDE_TAIL); 3932 testptr(result, result); 3933 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3934 3935 movl(result, stride2); 3936 movl(cnt2, result); 3937 negptr(result); 3938 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3939 3940 // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors. 3941 bind(VECTOR_NOT_EQUAL); 3942 // clean upper bits of YMM registers 3943 vpxor(vec1, vec1); 3944 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3945 lea(str1, Address(str1, result, scale)); 3946 lea(str2, Address(str2, result, scale)); 3947 } else { 3948 lea(str1, Address(str1, result, scale1)); 3949 lea(str2, Address(str2, result, scale2)); 3950 } 3951 jmp(COMPARE_16_CHARS); 3952 3953 // Compare tail chars, length between 1 and 15 chars 3954 bind(COMPARE_TAIL_LONG); 3955 movl(cnt2, result); 3956 cmpl(cnt2, stride); 3957 jcc(Assembler::less, COMPARE_SMALL_STR); 3958 3959 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3960 movdqu(vec1, Address(str1, 0)); 3961 } else { 3962 pmovzxbw(vec1, Address(str1, 0)); 3963 } 3964 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3965 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3966 subptr(cnt2, stride); 3967 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3968 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3969 lea(str1, Address(str1, result, scale)); 3970 lea(str2, Address(str2, result, scale)); 3971 } else { 3972 lea(str1, Address(str1, result, scale1)); 3973 lea(str2, Address(str2, result, scale2)); 3974 } 3975 negptr(cnt2); 3976 jmpb(WHILE_HEAD_LABEL); 3977 3978 bind(COMPARE_SMALL_STR); 3979 } else if (UseSSE42Intrinsics) { 3980 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3981 int pcmpmask = 0x19; 3982 // Setup to compare 8-char (16-byte) vectors, 3983 // start from first character again because it has aligned address.
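// result keeps the total remaining element count; cnt2 is rounded down to a multiple of 'stride' and drives the vector loop.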
3984 movl(result, cnt2); 3985 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3986 if (ae == StrIntrinsicNode::LL) { 3987 pcmpmask &= ~0x01; 3988 } 3989 jcc(Assembler::zero, COMPARE_TAIL); 3990 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3991 lea(str1, Address(str1, result, scale)); 3992 lea(str2, Address(str2, result, scale)); 3993 } else { 3994 lea(str1, Address(str1, result, scale1)); 3995 lea(str2, Address(str2, result, scale2)); 3996 } 3997 negptr(result); 3998 3999 // pcmpestri 4000 // inputs: 4001 // vec1- substring 4002 // rax - negative string length (elements count) 4003 // mem - scanned string 4004 // rdx - string length (elements count) 4005 // pcmpmask - cmp mode: 11000 (string compare with negated result) 4006 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 4007 // outputs: 4008 // rcx - first mismatched element index 4009 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 4010 4011 bind(COMPARE_WIDE_VECTORS); 4012 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4013 movdqu(vec1, Address(str1, result, scale)); 4014 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4015 } else { 4016 pmovzxbw(vec1, Address(str1, result, scale1)); 4017 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4018 } 4019 // After pcmpestri cnt1(rcx) contains mismatched element index 4020 4021 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 4022 addptr(result, stride); 4023 subptr(cnt2, stride); 4024 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4025 4026 // compare wide vectors tail 4027 testptr(result, result); 4028 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 4029 4030 movl(cnt2, stride); 4031 movl(result, stride); 4032 negptr(result); 4033 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4034 movdqu(vec1, Address(str1, result, scale)); 4035 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4036 } else { 4037 pmovzxbw(vec1, Address(str1, result, scale1)); 4038 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4039 } 4040 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 4041 4042 // Mismatched characters in the vectors 4043 bind(VECTOR_NOT_EQUAL); 4044 addptr(cnt1, result); 4045 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 4046 subl(result, cnt2); 4047 jmpb(POP_LABEL); 4048 4049 bind(COMPARE_TAIL); // limit is zero 4050 movl(cnt2, result); 4051 // Fallthru to tail compare 4052 } 4053 // Shift str2 and str1 to the end of the arrays, negate min 4054 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4055 lea(str1, Address(str1, cnt2, scale)); 4056 lea(str2, Address(str2, cnt2, scale)); 4057 } else { 4058 lea(str1, Address(str1, cnt2, scale1)); 4059 lea(str2, Address(str2, cnt2, scale2)); 4060 } 4061 decrementl(cnt2); // first character was compared already 4062 negptr(cnt2); 4063 4064 // Compare the rest of the elements 4065 bind(WHILE_HEAD_LABEL); 4066 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 4067 subl(result, cnt1); 4068 jccb(Assembler::notZero, POP_LABEL); 4069 increment(cnt2); 4070 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 4071 4072 // Strings are equal up to min length. Return the length difference. 
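// The length difference (cnt1 - cnt2) pushed at the start of string_compare is popped into result here.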
4073 bind(LENGTH_DIFF_LABEL); 4074 pop(result); 4075 if (ae == StrIntrinsicNode::UU) { 4076 // Divide diff by 2 to get number of chars 4077 sarl(result, 1); 4078 } 4079 jmpb(DONE_LABEL); 4080 4081 #ifdef _LP64 4082 if (VM_Version::supports_avx512vlbw()) { 4083 4084 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 4085 4086 kmovql(cnt1, mask); 4087 notq(cnt1); 4088 bsfq(cnt2, cnt1); 4089 if (ae != StrIntrinsicNode::LL) { 4090 // Divide diff by 2 to get number of chars 4091 sarl(cnt2, 1); 4092 } 4093 addq(result, cnt2); 4094 if (ae == StrIntrinsicNode::LL) { 4095 load_unsigned_byte(cnt1, Address(str2, result)); 4096 load_unsigned_byte(result, Address(str1, result)); 4097 } else if (ae == StrIntrinsicNode::UU) { 4098 load_unsigned_short(cnt1, Address(str2, result, scale)); 4099 load_unsigned_short(result, Address(str1, result, scale)); 4100 } else { 4101 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4102 load_unsigned_byte(result, Address(str1, result, scale1)); 4103 } 4104 subl(result, cnt1); 4105 jmpb(POP_LABEL); 4106 }//if (VM_Version::supports_avx512vlbw()) 4107 #endif // _LP64 4108 4109 // Discard the stored length difference 4110 bind(POP_LABEL); 4111 pop(cnt1); 4112 4113 // That's it 4114 bind(DONE_LABEL); 4115 if(ae == StrIntrinsicNode::UL) { 4116 negl(result); 4117 } 4118 4119 } 4120 4121 // Search for a non-ASCII character (negative byte value) in a byte array, 4122 // return the index of the first such character, otherwise the length 4123 // of the array segment searched. 4124 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4125 // @IntrinsicCandidate 4126 // public static int countPositives(byte[] ba, int off, int len) { 4127 // for (int i = off; i < off + len; i++) { 4128 // if (ba[i] < 0) { 4129 // return i - off; 4130 // } 4131 // } 4132 // return len; 4133 // } 4134 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4135 Register result, Register tmp1, 4136 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4137 // rsi: byte array 4138 // rcx: len 4139 // rax: result 4140 ShortBranchVerifier sbv(this); 4141 assert_different_registers(ary1, len, result, tmp1); 4142 assert_different_registers(vec1, vec2); 4143 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4144 4145 movl(result, len); // copy 4146 // len == 0 4147 testl(len, len); 4148 jcc(Assembler::zero, DONE); 4149 4150 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4151 VM_Version::supports_avx512vlbw() && 4152 VM_Version::supports_bmi2()) { 4153 4154 Label test_64_loop, test_tail, BREAK_LOOP; 4155 movl(tmp1, len); 4156 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4157 4158 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4159 andl(len, 0xffffffc0); // vector count (in chars) 4160 jccb(Assembler::zero, test_tail); 4161 4162 lea(ary1, Address(ary1, len, Address::times_1)); 4163 negptr(len); 4164 4165 bind(test_64_loop); 4166 // Check whether any of these 64 byte-sized elements are negative 4167 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4168 kortestql(mask1, mask1); 4169 jcc(Assembler::notZero, BREAK_LOOP); 4170 4171 addptr(len, 64); 4172 jccb(Assembler::notZero, test_64_loop); 4173 4174 bind(test_tail); 4175 // bail out when there is nothing to be done 4176 testl(tmp1, -1); 4177 jcc(Assembler::zero, DONE); 4178 4179 4180 // check the tail for absence of negatives 4181 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4182 #ifdef _LP64 4183 { 4184
Register tmp3_aliased = len; 4185 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 4186 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4187 notq(tmp3_aliased); 4188 kmovql(mask2, tmp3_aliased); 4189 } 4190 #else 4191 Label k_init; 4192 jmp(k_init); 4193 4194 // We cannot read 64 bits from a general purpose register, thus we move 4195 // the data required to compose 64 1's to the instruction stream. 4196 // We emit a 64-byte wide series of elements from 0..63 which later on will 4197 // be used as compare targets with the tail count contained in the tmp1 register. 4198 // The result will be a k register having tmp1 consecutive 1s, 4199 // counting from the least significant bit. 4200 address tmp = pc(); 4201 emit_int64(0x0706050403020100); 4202 emit_int64(0x0F0E0D0C0B0A0908); 4203 emit_int64(0x1716151413121110); 4204 emit_int64(0x1F1E1D1C1B1A1918); 4205 emit_int64(0x2726252423222120); 4206 emit_int64(0x2F2E2D2C2B2A2928); 4207 emit_int64(0x3736353433323130); 4208 emit_int64(0x3F3E3D3C3B3A3938); 4209 4210 bind(k_init); 4211 lea(len, InternalAddress(tmp)); 4212 // create mask to test for negative byte inside a vector 4213 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 4214 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 4215 4216 #endif 4217 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4218 ktestq(mask1, mask2); 4219 jcc(Assembler::zero, DONE); 4220 4221 // do a full check for negative bytes in the tail 4222 movl(len, tmp1); // tmp1 holds the low 6 bits of the original len 4223 // ary1 already pointing to the right place 4224 jmpb(TAIL_START); 4225 4226 bind(BREAK_LOOP); 4227 // At least one byte in the last 64 byte block was negative. 4228 // Set up to look at the last 64 bytes as if they were a tail 4229 lea(ary1, Address(ary1, len, Address::times_1)); 4230 addptr(result, len); 4231 // Ignore the very last byte: if all others are positive, 4232 // it must be negative, so we can skip right to the 2+1 byte 4233 // end comparison at this point 4234 orl(result, 63); 4235 movl(len, 63); 4236 // Fallthru to tail compare 4237 } else { 4238 4239 if (UseAVX >= 2 && UseSSE >= 2) { 4240 // With AVX2, use 32-byte vector compare 4241 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4242 4243 // Compare 32-byte vectors 4244 testl(len, 0xffffffe0); // vector count (in bytes) 4245 jccb(Assembler::zero, TAIL_START); 4246 4247 andl(len, 0xffffffe0); 4248 lea(ary1, Address(ary1, len, Address::times_1)); 4249 negptr(len); 4250 4251 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4252 movdl(vec2, tmp1); 4253 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4254 4255 bind(COMPARE_WIDE_VECTORS); 4256 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4257 vptest(vec1, vec2); 4258 jccb(Assembler::notZero, BREAK_LOOP); 4259 addptr(len, 32); 4260 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4261 4262 testl(result, 0x0000001f); // any bytes remaining? 4263 jcc(Assembler::zero, DONE); 4264 4265 // Quick test using the already prepared vector mask 4266 movl(len, result); 4267 andl(len, 0x0000001f); 4268 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4269 vptest(vec1, vec2); 4270 jcc(Assembler::zero, DONE); 4271 // There are negative bytes, jump to the tail to determine exactly where 4272 jmpb(TAIL_START); 4273 4274 bind(BREAK_LOOP); 4275 // At least one byte in the last 32-byte vector is negative.
4276 // Set up to look at the last 32 bytes as if they were a tail 4277 lea(ary1, Address(ary1, len, Address::times_1)); 4278 addptr(result, len); 4279 // Ignore the very last byte: if all others are positive, 4280 // it must be negative, so we can skip right to the 2+1 byte 4281 // end comparison at this point 4282 orl(result, 31); 4283 movl(len, 31); 4284 // Fallthru to tail compare 4285 } else if (UseSSE42Intrinsics) { 4286 // With SSE4.2, use double quad vector compare 4287 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4288 4289 // Compare 16-byte vectors 4290 testl(len, 0xfffffff0); // vector count (in bytes) 4291 jcc(Assembler::zero, TAIL_START); 4292 4293 andl(len, 0xfffffff0); 4294 lea(ary1, Address(ary1, len, Address::times_1)); 4295 negptr(len); 4296 4297 movl(tmp1, 0x80808080); 4298 movdl(vec2, tmp1); 4299 pshufd(vec2, vec2, 0); 4300 4301 bind(COMPARE_WIDE_VECTORS); 4302 movdqu(vec1, Address(ary1, len, Address::times_1)); 4303 ptest(vec1, vec2); 4304 jccb(Assembler::notZero, BREAK_LOOP); 4305 addptr(len, 16); 4306 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4307 4308 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4309 jcc(Assembler::zero, DONE); 4310 4311 // Quick test using the already prepared vector mask 4312 movl(len, result); 4313 andl(len, 0x0000000f); // tail count (in bytes) 4314 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4315 ptest(vec1, vec2); 4316 jcc(Assembler::zero, DONE); 4317 jmpb(TAIL_START); 4318 4319 bind(BREAK_LOOP); 4320 // At least one byte in the last 16-byte vector is negative. 4321 // Set up and look at the last 16 bytes as if they were a tail 4322 lea(ary1, Address(ary1, len, Address::times_1)); 4323 addptr(result, len); 4324 // Ignore the very last byte: if all others are positive, 4325 // it must be negative, so we can skip right to the 2+1 byte 4326 // end comparison at this point 4327 orl(result, 15); 4328 movl(len, 15); 4329 // Fallthru to tail compare 4330 } 4331 } 4332 4333 bind(TAIL_START); 4334 // Compare 4-byte vectors 4335 andl(len, 0xfffffffc); // vector count (in bytes) 4336 jccb(Assembler::zero, COMPARE_CHAR); 4337 4338 lea(ary1, Address(ary1, len, Address::times_1)); 4339 negptr(len); 4340 4341 bind(COMPARE_VECTORS); 4342 movl(tmp1, Address(ary1, len, Address::times_1)); 4343 andl(tmp1, 0x80808080); 4344 jccb(Assembler::notZero, TAIL_ADJUST); 4345 addptr(len, 4); 4346 jccb(Assembler::notZero, COMPARE_VECTORS); 4347 4348 // Compare trailing char (final 2-3 bytes), if any 4349 bind(COMPARE_CHAR); 4350 4351 testl(result, 0x2); // tail char 4352 jccb(Assembler::zero, COMPARE_BYTE); 4353 load_unsigned_short(tmp1, Address(ary1, 0)); 4354 andl(tmp1, 0x00008080); 4355 jccb(Assembler::notZero, CHAR_ADJUST); 4356 lea(ary1, Address(ary1, 2)); 4357 4358 bind(COMPARE_BYTE); 4359 testl(result, 0x1); // tail byte 4360 jccb(Assembler::zero, DONE); 4361 load_unsigned_byte(tmp1, Address(ary1, 0)); 4362 testl(tmp1, 0x00000080); 4363 jccb(Assembler::zero, DONE); 4364 subptr(result, 1); 4365 jmpb(DONE); 4366 4367 bind(TAIL_ADJUST); 4368 // there are negative bits in the last 4 byte block. 4369 // Adjust result and check the next three bytes 4370 addptr(result, len); 4371 orl(result, 3); 4372 lea(ary1, Address(ary1, len, Address::times_1)); 4373 jmpb(COMPARE_CHAR); 4374 4375 bind(CHAR_ADJUST); 4376 // We are looking at a char + optional byte tail, and found that one 4377 // of the bytes in the char is negative. Adjust the result, check the 4378 // first byte and readjust if needed. 
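// Round result down to the start of the char: if its first (low) byte is the negative one, result already points at it; otherwise advance by one to the second byte.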
4379 andl(result, 0xfffffffc); 4380 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4381 jccb(Assembler::notZero, DONE); 4382 addptr(result, 1); 4383 4384 // That's it 4385 bind(DONE); 4386 if (UseAVX >= 2 && UseSSE >= 2) { 4387 // clean upper bits of YMM registers 4388 vpxor(vec1, vec1); 4389 vpxor(vec2, vec2); 4390 } 4391 } 4392 4393 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4394 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4395 Register limit, Register result, Register chr, 4396 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) { 4397 ShortBranchVerifier sbv(this); 4398 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4399 4400 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4401 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4402 4403 if (is_array_equ) { 4404 // Check the input args 4405 cmpoop(ary1, ary2); 4406 jcc(Assembler::equal, TRUE_LABEL); 4407 4408 // Need additional checks for arrays_equals. 4409 testptr(ary1, ary1); 4410 jcc(Assembler::zero, FALSE_LABEL); 4411 testptr(ary2, ary2); 4412 jcc(Assembler::zero, FALSE_LABEL); 4413 4414 // Check the lengths 4415 movl(limit, Address(ary1, length_offset)); 4416 cmpl(limit, Address(ary2, length_offset)); 4417 jcc(Assembler::notEqual, FALSE_LABEL); 4418 } 4419 4420 // count == 0 4421 testl(limit, limit); 4422 jcc(Assembler::zero, TRUE_LABEL); 4423 4424 if (is_array_equ) { 4425 // Load array address 4426 lea(ary1, Address(ary1, base_offset)); 4427 lea(ary2, Address(ary2, base_offset)); 4428 } 4429 4430 if (is_array_equ && is_char) { 4431 // arrays_equals when used for char[]. 4432 shll(limit, 1); // byte count != 0 4433 } 4434 movl(result, limit); // copy 4435 4436 if (UseAVX >= 2) { 4437 // With AVX2, use 32-byte vector compare 4438 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4439 4440 // Compare 32-byte vectors 4441 andl(result, 0x0000001f); // tail count (in bytes) 4442 andl(limit, 0xffffffe0); // vector count (in bytes) 4443 jcc(Assembler::zero, COMPARE_TAIL); 4444 4445 lea(ary1, Address(ary1, limit, Address::times_1)); 4446 lea(ary2, Address(ary2, limit, Address::times_1)); 4447 negptr(limit); 4448 4449 #ifdef _LP64 4450 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4451 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4452 4453 cmpl(limit, -64); 4454 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4455 4456 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4457 4458 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4459 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4460 kortestql(mask, mask); 4461 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4462 addptr(limit, 64); // update since we already compared at this addr 4463 cmpl(limit, -64); 4464 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4465 4466 // At this point we may still need to compare -limit+result bytes. 4467 // We could execute the next two instruction and just continue via non-wide path: 4468 // cmpl(limit, 0); 4469 // jcc(Assembler::equal, COMPARE_TAIL); // true 4470 // But since we stopped at the points ary{1,2}+limit which are 4471 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4472 // (|limit| <= 32 and result < 32), 4473 // we may just compare the last 64 bytes. 
4474 // 4475 addptr(result, -64); // it is safe, bc we just came from this area 4476 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4477 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4478 kortestql(mask, mask); 4479 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4480 4481 jmp(TRUE_LABEL); 4482 4483 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4484 4485 }//if (VM_Version::supports_avx512vlbw()) 4486 #endif //_LP64 4487 bind(COMPARE_WIDE_VECTORS); 4488 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 4489 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4490 vpxor(vec1, vec2); 4491 4492 vptest(vec1, vec1); 4493 jcc(Assembler::notZero, FALSE_LABEL); 4494 addptr(limit, 32); 4495 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4496 4497 testl(result, result); 4498 jcc(Assembler::zero, TRUE_LABEL); 4499 4500 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 4501 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4502 vpxor(vec1, vec2); 4503 4504 vptest(vec1, vec1); 4505 jccb(Assembler::notZero, FALSE_LABEL); 4506 jmpb(TRUE_LABEL); 4507 4508 bind(COMPARE_TAIL); // limit is zero 4509 movl(limit, result); 4510 // Fallthru to tail compare 4511 } else if (UseSSE42Intrinsics) { 4512 // With SSE4.2, use double quad vector compare 4513 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4514 4515 // Compare 16-byte vectors 4516 andl(result, 0x0000000f); // tail count (in bytes) 4517 andl(limit, 0xfffffff0); // vector count (in bytes) 4518 jcc(Assembler::zero, COMPARE_TAIL); 4519 4520 lea(ary1, Address(ary1, limit, Address::times_1)); 4521 lea(ary2, Address(ary2, limit, Address::times_1)); 4522 negptr(limit); 4523 4524 bind(COMPARE_WIDE_VECTORS); 4525 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4526 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4527 pxor(vec1, vec2); 4528 4529 ptest(vec1, vec1); 4530 jcc(Assembler::notZero, FALSE_LABEL); 4531 addptr(limit, 16); 4532 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4533 4534 testl(result, result); 4535 jcc(Assembler::zero, TRUE_LABEL); 4536 4537 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4538 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4539 pxor(vec1, vec2); 4540 4541 ptest(vec1, vec1); 4542 jccb(Assembler::notZero, FALSE_LABEL); 4543 jmpb(TRUE_LABEL); 4544 4545 bind(COMPARE_TAIL); // limit is zero 4546 movl(limit, result); 4547 // Fallthru to tail compare 4548 } 4549 4550 // Compare 4-byte vectors 4551 andl(limit, 0xfffffffc); // vector count (in bytes) 4552 jccb(Assembler::zero, COMPARE_CHAR); 4553 4554 lea(ary1, Address(ary1, limit, Address::times_1)); 4555 lea(ary2, Address(ary2, limit, Address::times_1)); 4556 negptr(limit); 4557 4558 bind(COMPARE_VECTORS); 4559 movl(chr, Address(ary1, limit, Address::times_1)); 4560 cmpl(chr, Address(ary2, limit, Address::times_1)); 4561 jccb(Assembler::notEqual, FALSE_LABEL); 4562 addptr(limit, 4); 4563 jcc(Assembler::notZero, COMPARE_VECTORS); 4564 4565 // Compare trailing char (final 2 bytes), if any 4566 bind(COMPARE_CHAR); 4567 testl(result, 0x2); // tail char 4568 jccb(Assembler::zero, COMPARE_BYTE); 4569 load_unsigned_short(chr, Address(ary1, 0)); 4570 load_unsigned_short(limit, Address(ary2, 0)); 4571 cmpl(chr, limit); 4572 jccb(Assembler::notEqual, FALSE_LABEL); 4573 4574 if (is_array_equ && is_char) { 4575 bind(COMPARE_BYTE); 4576 } else { 4577 lea(ary1, Address(ary1, 2)); 4578 lea(ary2, Address(ary2, 2)); 4579 4580 bind(COMPARE_BYTE); 4581 testl(result, 0x1); 
// tail byte 4582 jccb(Assembler::zero, TRUE_LABEL); 4583 load_unsigned_byte(chr, Address(ary1, 0)); 4584 load_unsigned_byte(limit, Address(ary2, 0)); 4585 cmpl(chr, limit); 4586 jccb(Assembler::notEqual, FALSE_LABEL); 4587 } 4588 bind(TRUE_LABEL); 4589 movl(result, 1); // return true 4590 jmpb(DONE); 4591 4592 bind(FALSE_LABEL); 4593 xorl(result, result); // return false 4594 4595 // That's it 4596 bind(DONE); 4597 if (UseAVX >= 2) { 4598 // clean upper bits of YMM registers 4599 vpxor(vec1, vec1); 4600 vpxor(vec2, vec2); 4601 } 4602 } 4603 4604 #ifdef _LP64 4605 4606 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4607 #define __ masm. 4608 Register dst = stub.data<0>(); 4609 XMMRegister src = stub.data<1>(); 4610 address target = stub.data<2>(); 4611 __ bind(stub.entry()); 4612 __ subptr(rsp, 8); 4613 __ movdbl(Address(rsp), src); 4614 __ call(RuntimeAddress(target)); 4615 __ pop(dst); 4616 __ jmp(stub.continuation()); 4617 #undef __ 4618 } 4619 4620 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4621 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4622 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4623 4624 address slowpath_target; 4625 if (dst_bt == T_INT) { 4626 if (src_bt == T_FLOAT) { 4627 cvttss2sil(dst, src); 4628 cmpl(dst, 0x80000000); 4629 slowpath_target = StubRoutines::x86::f2i_fixup(); 4630 } else { 4631 cvttsd2sil(dst, src); 4632 cmpl(dst, 0x80000000); 4633 slowpath_target = StubRoutines::x86::d2i_fixup(); 4634 } 4635 } else { 4636 if (src_bt == T_FLOAT) { 4637 cvttss2siq(dst, src); 4638 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4639 slowpath_target = StubRoutines::x86::f2l_fixup(); 4640 } else { 4641 cvttsd2siq(dst, src); 4642 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4643 slowpath_target = StubRoutines::x86::d2l_fixup(); 4644 } 4645 } 4646 4647 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4648 jcc(Assembler::equal, stub->entry()); 4649 bind(stub->continuation()); 4650 } 4651 4652 #endif // _LP64 4653 4654 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4655 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4656 switch(ideal_opc) { 4657 case Op_LShiftVS: 4658 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4659 case Op_LShiftVI: 4660 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4661 case Op_LShiftVL: 4662 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4663 case Op_RShiftVS: 4664 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4665 case Op_RShiftVI: 4666 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4667 case Op_RShiftVL: 4668 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4669 case Op_URShiftVS: 4670 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4671 case Op_URShiftVI: 4672 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4673 case Op_URShiftVL: 4674 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4675 case Op_RotateRightV: 4676 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4677 case Op_RotateLeftV: 4678 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4679 default: 4680 fatal("Unsupported masked operation"); break; 4681 } 4682 } 4683 4684 void 
C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4685 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4686 bool is_varshift) { 4687 switch (ideal_opc) { 4688 case Op_AddVB: 4689 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4690 case Op_AddVS: 4691 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4692 case Op_AddVI: 4693 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4694 case Op_AddVL: 4695 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4696 case Op_AddVF: 4697 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4698 case Op_AddVD: 4699 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4700 case Op_SubVB: 4701 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4702 case Op_SubVS: 4703 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4704 case Op_SubVI: 4705 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4706 case Op_SubVL: 4707 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4708 case Op_SubVF: 4709 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4710 case Op_SubVD: 4711 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4712 case Op_MulVS: 4713 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4714 case Op_MulVI: 4715 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4716 case Op_MulVL: 4717 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4718 case Op_MulVF: 4719 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4720 case Op_MulVD: 4721 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4722 case Op_DivVF: 4723 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4724 case Op_DivVD: 4725 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4726 case Op_SqrtVF: 4727 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4728 case Op_SqrtVD: 4729 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4730 case Op_AbsVB: 4731 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4732 case Op_AbsVS: 4733 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4734 case Op_AbsVI: 4735 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4736 case Op_AbsVL: 4737 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4738 case Op_FmaVF: 4739 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4740 case Op_FmaVD: 4741 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4742 case Op_VectorRearrange: 4743 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4744 case Op_LShiftVS: 4745 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4746 case Op_LShiftVI: 4747 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4748 case Op_LShiftVL: 4749 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4750 case Op_RShiftVS: 4751 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4752 case Op_RShiftVI: 4753 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4754 case Op_RShiftVL: 4755 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4756 case Op_URShiftVS: 4757 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4758 case Op_URShiftVI: 4759 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4760 case Op_URShiftVL: 4761 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4762 case Op_RotateLeftV: 4763 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4764 case Op_RotateRightV: 4765 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 
4766 case Op_MaxV: 4767 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4768 case Op_MinV: 4769 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4770 case Op_XorV: 4771 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4772 case Op_OrV: 4773 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4774 case Op_AndV: 4775 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4776 default: 4777 fatal("Unsupported masked operation"); break; 4778 } 4779 } 4780 4781 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4782 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4783 switch (ideal_opc) { 4784 case Op_AddVB: 4785 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4786 case Op_AddVS: 4787 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4788 case Op_AddVI: 4789 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4790 case Op_AddVL: 4791 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4792 case Op_AddVF: 4793 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4794 case Op_AddVD: 4795 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4796 case Op_SubVB: 4797 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4798 case Op_SubVS: 4799 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4800 case Op_SubVI: 4801 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4802 case Op_SubVL: 4803 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4804 case Op_SubVF: 4805 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4806 case Op_SubVD: 4807 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4808 case Op_MulVS: 4809 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4810 case Op_MulVI: 4811 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4812 case Op_MulVL: 4813 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4814 case Op_MulVF: 4815 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4816 case Op_MulVD: 4817 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4818 case Op_DivVF: 4819 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4820 case Op_DivVD: 4821 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4822 case Op_FmaVF: 4823 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4824 case Op_FmaVD: 4825 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4826 case Op_MaxV: 4827 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4828 case Op_MinV: 4829 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4830 case Op_XorV: 4831 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4832 case Op_OrV: 4833 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4834 case Op_AndV: 4835 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4836 default: 4837 fatal("Unsupported masked operation"); break; 4838 } 4839 } 4840 4841 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4842 KRegister src1, KRegister src2) { 4843 BasicType etype = T_ILLEGAL; 4844 switch(mask_len) { 4845 case 2: 4846 case 4: 4847 case 8: etype = T_BYTE; break; 4848 case 16: etype = T_SHORT; break; 4849 case 32: etype = T_INT; break; 4850 case 64: etype = T_LONG; break; 4851 default: fatal("Unsupported type"); break; 4852 } 4853 assert(etype != T_ILLEGAL, ""); 4854 switch(ideal_opc) { 4855 case Op_AndVMask: 4856 kand(etype, dst, src1, src2); break; 4857 case Op_OrVMask: 4858 kor(etype, dst, src1, src2); break; 4859 case Op_XorVMask: 
4860 kxor(etype, dst, src1, src2); break; 4861 default: 4862 fatal("Unsupported masked operation"); break; 4863 } 4864 } 4865 4866 /* 4867 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation. 4868 * If src is NaN, the result is 0. 4869 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4870 * the result is equal to the value of Integer.MIN_VALUE. 4871 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4872 * the result is equal to the value of Integer.MAX_VALUE. 4873 */ 4874 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4875 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4876 Register rscratch, AddressLiteral float_sign_flip, 4877 int vec_enc) { 4878 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4879 Label done; 4880 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4881 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4882 vptest(xtmp2, xtmp2, vec_enc); 4883 jccb(Assembler::equal, done); 4884 4885 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4886 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4887 4888 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4889 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4890 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4891 4892 // Recompute the mask for the remaining special values. 4893 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4894 // Extract SRC values corresponding to TRUE mask lanes. 4895 vpand(xtmp4, xtmp2, src, vec_enc); 4896 // Flip mask bits so that the MSB of MASK lanes corresponding to +ve special 4897 // values is set. 4898 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4899 4900 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4901 bind(done); 4902 } 4903 4904 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4905 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4906 Register rscratch, AddressLiteral float_sign_flip, 4907 int vec_enc) { 4908 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4909 Label done; 4910 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4911 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4912 kortestwl(ktmp1, ktmp1); 4913 jccb(Assembler::equal, done); 4914 4915 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4916 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4917 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4918 4919 kxorwl(ktmp1, ktmp1, ktmp2); 4920 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4921 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4922 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4923 bind(done); 4924 } 4925 4926 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4927 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4928 Register rscratch, AddressLiteral double_sign_flip, 4929 int vec_enc) { 4930 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4931 4932 Label done; 4933 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4934 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4935 kortestwl(ktmp1, ktmp1); 4936 jccb(Assembler::equal, done); 4937 4938 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4939 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4940 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4941 4942 kxorwl(ktmp1, ktmp1, ktmp2);
4943 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4944 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4945 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4946 bind(done); 4947 } 4948 4949 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4950 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4951 Register rscratch, AddressLiteral float_sign_flip, 4952 int vec_enc) { 4953 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4954 Label done; 4955 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4956 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4957 kortestwl(ktmp1, ktmp1); 4958 jccb(Assembler::equal, done); 4959 4960 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4961 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4962 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4963 4964 kxorwl(ktmp1, ktmp1, ktmp2); 4965 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4966 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4967 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4968 bind(done); 4969 } 4970 4971 /* 4972 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation. 4973 * If src is NaN, the result is 0. 4974 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4975 * the result is equal to the value of Long.MIN_VALUE. 4976 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 4977 * the result is equal to the value of Long.MAX_VALUE. 4978 */ 4979 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4980 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4981 Register rscratch, AddressLiteral double_sign_flip, 4982 int vec_enc) { 4983 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4984 4985 Label done; 4986 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4987 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 4988 kortestwl(ktmp1, ktmp1); 4989 jccb(Assembler::equal, done); 4990 4991 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4992 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4993 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4994 4995 kxorwl(ktmp1, ktmp1, ktmp2); 4996 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4997 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4998 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4999 bind(done); 5000 } 5001 5002 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 5003 XMMRegister xtmp, int index, int vec_enc) { 5004 assert(vec_enc < Assembler::AVX_512bit, ""); 5005 if (vec_enc == Assembler::AVX_256bit) { 5006 vextractf128_high(xtmp, src); 5007 vshufps(dst, src, xtmp, index, vec_enc); 5008 } else { 5009 vshufps(dst, src, zero, index, vec_enc); 5010 } 5011 } 5012 5013 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5014 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 5015 AddressLiteral float_sign_flip, int src_vec_enc) { 5016 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5017 5018 Label done; 5019 // Compare the destination lanes with the float_sign_flip 5020 // value to get a mask of all special values.
5021 movdqu(xtmp1, float_sign_flip, rscratch); 5022 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 5023 ptest(xtmp2, xtmp2); 5024 jccb(Assembler::equal, done); 5025 5026 // Flip float_sign_flip to get max integer value. 5027 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 5028 pxor(xtmp1, xtmp4); 5029 5030 // Set destination lanes corresponding to unordered source lanes to zero. 5031 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 5032 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 5033 5034 // Shuffle the mask vector and pack the lower double word from each quadword lane. 5035 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5036 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 5037 5038 // Recompute the mask for the remaining special values. 5039 pxor(xtmp2, xtmp3); 5040 // Extract the mask corresponding to non-negative source lanes. 5041 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 5042 5043 // Shuffle the mask vector and pack the lower double word from each quadword lane. 5044 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5045 pand(xtmp3, xtmp2); 5046 5047 // Replace destination lanes holding the special value (0x80000000) with max int 5048 // if the corresponding source lane holds a +ve value. 5049 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5050 bind(done); 5051 } 5052 5053 5054 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5055 XMMRegister xtmp, Register rscratch, int vec_enc) { 5056 switch(to_elem_bt) { 5057 case T_SHORT: 5058 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5059 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5060 vpackusdw(dst, dst, zero, vec_enc); 5061 if (vec_enc == Assembler::AVX_256bit) { 5062 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5063 } 5064 break; 5065 case T_BYTE: 5066 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5067 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5068 vpackusdw(dst, dst, zero, vec_enc); 5069 if (vec_enc == Assembler::AVX_256bit) { 5070 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5071 } 5072 vpackuswb(dst, dst, zero, vec_enc); 5073 break; 5074 default: assert(false, "%s", type2name(to_elem_bt)); 5075 } 5076 } 5077 5078 /* 5079 * Algorithm for vector D2L and F2I conversions: 5080 * a) Perform the vector D2L/F2I cast. 5081 * b) Choose the fast path if no result vector lane contains the value 0x80000000. 5082 * That value signifies that the source could be any of the special floating point 5083 * values (NaN, -Inf, Inf, Max, -Min). 5084 * c) Set the destination to zero if the source is NaN. 5085 * d) Replace 0x80000000 with MaxInt if the corresponding source lane holds a +ve value.
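 *
 * A worked example (illustrative, not part of the algorithm description above):
 * casting the float 3.5e9f to int. vcvttps2dq writes 0x80000000 for any lane
 * whose value is NaN or out of range, so the lane matches float_sign_flip and
 * takes the slow path (step b). The lane is not NaN (step c) and the source is
 * positive, so step d replaces 0x80000000 with Integer.MAX_VALUE (0x7FFFFFFF).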
5086 */ 5087 5088 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5089 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5090 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5091 int to_elem_sz = type2aelembytes(to_elem_bt); 5092 assert(to_elem_sz <= 4, ""); 5093 vcvttps2dq(dst, src, vec_enc); 5094 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5095 if (to_elem_sz < 4) { 5096 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5097 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5098 } 5099 } 5100 5101 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5102 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5103 Register rscratch, int vec_enc) { 5104 int to_elem_sz = type2aelembytes(to_elem_bt); 5105 assert(to_elem_sz <= 4, ""); 5106 vcvttps2dq(dst, src, vec_enc); 5107 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5108 switch(to_elem_bt) { 5109 case T_INT: 5110 break; 5111 case T_SHORT: 5112 evpmovdw(dst, dst, vec_enc); 5113 break; 5114 case T_BYTE: 5115 evpmovdb(dst, dst, vec_enc); 5116 break; 5117 default: assert(false, "%s", type2name(to_elem_bt)); 5118 } 5119 } 5120 5121 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5122 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5123 Register rscratch, int vec_enc) { 5124 evcvttps2qq(dst, src, vec_enc); 5125 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 5126 } 5127 5128 // Handling for downcasting from double to integer or sub-word types on AVX2. 5129 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5130 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5131 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5132 int to_elem_sz = type2aelembytes(to_elem_bt); 5133 assert(to_elem_sz < 8, ""); 5134 vcvttpd2dq(dst, src, vec_enc); 5135 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5136 float_sign_flip, vec_enc); 5137 if (to_elem_sz < 4) { 5138 // xtmp4 holds all zero lanes. 
5139 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5140 } 5141 } 5142 5143 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5144 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5145 KRegister ktmp2, AddressLiteral sign_flip, 5146 Register rscratch, int vec_enc) { 5147 if (VM_Version::supports_avx512dq()) { 5148 evcvttpd2qq(dst, src, vec_enc); 5149 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5150 switch(to_elem_bt) { 5151 case T_LONG: 5152 break; 5153 case T_INT: 5154 evpmovsqd(dst, dst, vec_enc); 5155 break; 5156 case T_SHORT: 5157 evpmovsqd(dst, dst, vec_enc); 5158 evpmovdw(dst, dst, vec_enc); 5159 break; 5160 case T_BYTE: 5161 evpmovsqd(dst, dst, vec_enc); 5162 evpmovdb(dst, dst, vec_enc); 5163 break; 5164 default: assert(false, "%s", type2name(to_elem_bt)); 5165 } 5166 } else { 5167 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5168 vcvttpd2dq(dst, src, vec_enc); 5169 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5170 switch(to_elem_bt) { 5171 case T_INT: 5172 break; 5173 case T_SHORT: 5174 evpmovdw(dst, dst, vec_enc); 5175 break; 5176 case T_BYTE: 5177 evpmovdb(dst, dst, vec_enc); 5178 break; 5179 default: assert(false, "%s", type2name(to_elem_bt)); 5180 } 5181 } 5182 } 5183 5184 #ifdef _LP64 5185 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5186 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5187 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5188 // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf, 5189 // and restore the original MXCSR.RC mode afterwards. 5190 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5191 5192 mov64(tmp, julong_cast(0.5L)); 5193 evpbroadcastq(xtmp1, tmp, vec_enc); 5194 vaddpd(xtmp1, src, xtmp1, vec_enc); 5195 evcvtpd2qq(dst, xtmp1, vec_enc); 5196 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5197 double_sign_flip, vec_enc); 5198 5199 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5200 } 5201 5202 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5203 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5204 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5205 // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf, 5206 // and restore the original MXCSR.RC mode afterwards.
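  // Illustrative examples (under the round-down mode installed by the ldmxcsr below):
  // src = 2.5f gives 2.5 + 0.5 = 3.0 and floor(3.0) = 3; src = -2.5f gives
  // -2.5 + 0.5 = -2.0 and floor(-2.0) = -2, matching Math.round's round-half-up
  // semantics rather than round-half-even.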
5207 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5208 5209 movl(tmp, jint_cast(0.5)); 5210 movq(xtmp1, tmp); 5211 vbroadcastss(xtmp1, xtmp1, vec_enc); 5212 vaddps(xtmp1, src, xtmp1, vec_enc); 5213 vcvtps2dq(dst, xtmp1, vec_enc); 5214 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5215 float_sign_flip, vec_enc); 5216 5217 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5218 } 5219 5220 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5221 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5222 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5223 // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf, 5224 // and restore the original MXCSR.RC mode afterwards. 5225 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5226 5227 movl(tmp, jint_cast(0.5)); 5228 movq(xtmp1, tmp); 5229 vbroadcastss(xtmp1, xtmp1, vec_enc); 5230 vaddps(xtmp1, src, xtmp1, vec_enc); 5231 vcvtps2dq(dst, xtmp1, vec_enc); 5232 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5233 5234 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5235 } 5236 #endif // _LP64 5237 5238 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5239 BasicType from_elem_bt, BasicType to_elem_bt) { 5240 switch (from_elem_bt) { 5241 case T_BYTE: 5242 switch (to_elem_bt) { 5243 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5244 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5245 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5246 default: ShouldNotReachHere(); 5247 } 5248 break; 5249 case T_SHORT: 5250 switch (to_elem_bt) { 5251 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5252 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5253 default: ShouldNotReachHere(); 5254 } 5255 break; 5256 case T_INT: 5257 assert(to_elem_bt == T_LONG, ""); 5258 vpmovzxdq(dst, src, vlen_enc); 5259 break; 5260 default: 5261 ShouldNotReachHere(); 5262 } 5263 } 5264 5265 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5266 BasicType from_elem_bt, BasicType to_elem_bt) { 5267 switch (from_elem_bt) { 5268 case T_BYTE: 5269 switch (to_elem_bt) { 5270 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5271 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5272 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5273 default: ShouldNotReachHere(); 5274 } 5275 break; 5276 case T_SHORT: 5277 switch (to_elem_bt) { 5278 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5279 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5280 default: ShouldNotReachHere(); 5281 } 5282 break; 5283 case T_INT: 5284 assert(to_elem_bt == T_LONG, ""); 5285 vpmovsxdq(dst, src, vlen_enc); 5286 break; 5287 default: 5288 ShouldNotReachHere(); 5289 } 5290 } 5291 5292 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5293 BasicType dst_bt, BasicType src_bt, int vlen) { 5294 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5295 assert(vlen_enc != AVX_512bit, ""); 5296 5297 int dst_bt_size = type2aelembytes(dst_bt); 5298 int src_bt_size = type2aelembytes(src_bt); 5299 if (dst_bt_size > src_bt_size) { 5300 switch (dst_bt_size / src_bt_size) { 5301 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5302 case 4: vpmovsxbd(dst, src,
vlen_enc); break; 5303 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5304 default: ShouldNotReachHere(); 5305 } 5306 } else { 5307 assert(dst_bt_size < src_bt_size, ""); 5308 switch (src_bt_size / dst_bt_size) { 5309 case 2: { 5310 if (vlen_enc == AVX_128bit) { 5311 vpacksswb(dst, src, src, vlen_enc); 5312 } else { 5313 vpacksswb(dst, src, src, vlen_enc); 5314 vpermq(dst, dst, 0x08, vlen_enc); 5315 } 5316 break; 5317 } 5318 case 4: { 5319 if (vlen_enc == AVX_128bit) { 5320 vpackssdw(dst, src, src, vlen_enc); 5321 vpacksswb(dst, dst, dst, vlen_enc); 5322 } else { 5323 vpackssdw(dst, src, src, vlen_enc); 5324 vpermq(dst, dst, 0x08, vlen_enc); 5325 vpacksswb(dst, dst, dst, AVX_128bit); 5326 } 5327 break; 5328 } 5329 case 8: { 5330 if (vlen_enc == AVX_128bit) { 5331 vpshufd(dst, src, 0x08, vlen_enc); 5332 vpackssdw(dst, dst, dst, vlen_enc); 5333 vpacksswb(dst, dst, dst, vlen_enc); 5334 } else { 5335 vpshufd(dst, src, 0x08, vlen_enc); 5336 vpermq(dst, dst, 0x08, vlen_enc); 5337 vpackssdw(dst, dst, dst, AVX_128bit); 5338 vpacksswb(dst, dst, dst, AVX_128bit); 5339 } 5340 break; 5341 } 5342 default: ShouldNotReachHere(); 5343 } 5344 } 5345 } 5346 5347 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5348 bool merge, BasicType bt, int vlen_enc) { 5349 if (bt == T_INT) { 5350 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5351 } else { 5352 assert(bt == T_LONG, ""); 5353 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5354 } 5355 } 5356 5357 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5358 bool merge, BasicType bt, int vlen_enc) { 5359 if (bt == T_INT) { 5360 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5361 } else { 5362 assert(bt == T_LONG, ""); 5363 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5364 } 5365 } 5366 5367 #ifdef _LP64 5368 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5369 Register rtmp2, XMMRegister xtmp, int mask_len, 5370 int vec_enc) { 5371 int index = 0; 5372 int vindex = 0; 5373 mov64(rtmp1, 0x0101010101010101L); 5374 pdepq(rtmp1, src, rtmp1); 5375 if (mask_len > 8) { 5376 movq(rtmp2, src); 5377 vpxor(xtmp, xtmp, xtmp, vec_enc); 5378 movq(xtmp, rtmp1); 5379 } 5380 movq(dst, rtmp1); 5381 5382 mask_len -= 8; 5383 while (mask_len > 0) { 5384 assert((mask_len & 0x7) == 0, "mask must be a multiple of 8"); 5385 index++; 5386 if ((index % 2) == 0) { 5387 pxor(xtmp, xtmp); 5388 } 5389 mov64(rtmp1, 0x0101010101010101L); 5390 shrq(rtmp2, 8); 5391 pdepq(rtmp1, rtmp2, rtmp1); 5392 pinsrq(xtmp, rtmp1, index % 2); 5393 vindex = index / 2; 5394 if (vindex) { 5395 // Write the entire 16 byte vector when both 64 bit 5396 // lanes are updated, to save redundant instructions.
5397 if (index % 2) { 5398 vinsertf128(dst, dst, xtmp, vindex); 5399 } 5400 } else { 5401 vmovdqu(dst, xtmp); 5402 } 5403 mask_len -= 8; 5404 } 5405 } 5406 5407 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5408 switch(opc) { 5409 case Op_VectorMaskTrueCount: 5410 popcntq(dst, tmp); 5411 break; 5412 case Op_VectorMaskLastTrue: 5413 if (VM_Version::supports_lzcnt()) { 5414 lzcntq(tmp, tmp); 5415 movl(dst, 63); 5416 subl(dst, tmp); 5417 } else { 5418 movl(dst, -1); 5419 bsrq(tmp, tmp); 5420 cmov32(Assembler::notZero, dst, tmp); 5421 } 5422 break; 5423 case Op_VectorMaskFirstTrue: 5424 if (VM_Version::supports_bmi1()) { 5425 if (masklen < 32) { 5426 orl(tmp, 1 << masklen); 5427 tzcntl(dst, tmp); 5428 } else if (masklen == 32) { 5429 tzcntl(dst, tmp); 5430 } else { 5431 assert(masklen == 64, ""); 5432 tzcntq(dst, tmp); 5433 } 5434 } else { 5435 if (masklen < 32) { 5436 orl(tmp, 1 << masklen); 5437 bsfl(dst, tmp); 5438 } else { 5439 assert(masklen == 32 || masklen == 64, ""); 5440 movl(dst, masklen); 5441 if (masklen == 32) { 5442 bsfl(tmp, tmp); 5443 } else { 5444 bsfq(tmp, tmp); 5445 } 5446 cmov32(Assembler::notZero, dst, tmp); 5447 } 5448 } 5449 break; 5450 case Op_VectorMaskToLong: 5451 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5452 break; 5453 default: assert(false, "Unhandled mask operation"); 5454 } 5455 } 5456 5457 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5458 int masklen, int masksize, int vec_enc) { 5459 assert(VM_Version::supports_popcnt(), ""); 5460 5461 if(VM_Version::supports_avx512bw()) { 5462 kmovql(tmp, mask); 5463 } else { 5464 assert(masklen <= 16, ""); 5465 kmovwl(tmp, mask); 5466 } 5467 5468 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5469 // operations needs to be clipped. 5470 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5471 andq(tmp, (1 << masklen) - 1); 5472 } 5473 5474 vector_mask_operation_helper(opc, dst, tmp, masklen); 5475 } 5476 5477 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5478 Register tmp, int masklen, BasicType bt, int vec_enc) { 5479 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5480 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5481 assert(VM_Version::supports_popcnt(), ""); 5482 5483 bool need_clip = false; 5484 switch(bt) { 5485 case T_BOOLEAN: 5486 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5487 vpxor(xtmp, xtmp, xtmp, vec_enc); 5488 vpsubb(xtmp, xtmp, mask, vec_enc); 5489 vpmovmskb(tmp, xtmp, vec_enc); 5490 need_clip = masklen < 16; 5491 break; 5492 case T_BYTE: 5493 vpmovmskb(tmp, mask, vec_enc); 5494 need_clip = masklen < 16; 5495 break; 5496 case T_SHORT: 5497 vpacksswb(xtmp, mask, mask, vec_enc); 5498 if (masklen >= 16) { 5499 vpermpd(xtmp, xtmp, 8, vec_enc); 5500 } 5501 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5502 need_clip = masklen < 16; 5503 break; 5504 case T_INT: 5505 case T_FLOAT: 5506 vmovmskps(tmp, mask, vec_enc); 5507 need_clip = masklen < 4; 5508 break; 5509 case T_LONG: 5510 case T_DOUBLE: 5511 vmovmskpd(tmp, mask, vec_enc); 5512 need_clip = masklen < 2; 5513 break; 5514 default: assert(false, "Unhandled type, %s", type2name(bt)); 5515 } 5516 5517 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5518 // operations needs to be clipped. 
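  // For example (illustrative): with masklen == 4 only the low four bits of tmp
  // are meaningful, so tmp is masked with (1 << 4) - 1 == 0xF to keep stale
  // higher bits from corrupting the popcnt/bsf/bsr based helpers.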
5519 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5520 // need_clip implies masklen < 32 5521 andq(tmp, (1 << masklen) - 1); 5522 } 5523 5524 vector_mask_operation_helper(opc, dst, tmp, masklen); 5525 } 5526 5527 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5528 Register rtmp2, int mask_len) { 5529 kmov(rtmp1, src); 5530 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5531 mov64(rtmp2, -1L); 5532 pextq(rtmp2, rtmp2, rtmp1); 5533 kmov(dst, rtmp2); 5534 } 5535 5536 #ifdef _LP64 5537 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5538 XMMRegister mask, Register rtmp, Register rscratch, 5539 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5540 int vec_enc) { 5541 assert(type2aelembytes(bt) >= 4, ""); 5542 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5543 address compress_perm_table = nullptr; 5544 address expand_perm_table = nullptr; 5545 if (type2aelembytes(bt) == 8) { 5546 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5547 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5548 vmovmskpd(rtmp, mask, vec_enc); 5549 } else { 5550 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5551 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5552 vmovmskps(rtmp, mask, vec_enc); 5553 } 5554 shlq(rtmp, 5); // for 32 byte permute row. 5555 if (opcode == Op_CompressV) { 5556 lea(rscratch, ExternalAddress(compress_perm_table)); 5557 } else { 5558 lea(rscratch, ExternalAddress(expand_perm_table)); 5559 } 5560 addptr(rtmp, rscratch); 5561 vmovdqu(permv, Address(rtmp, 0)); 5562 vpermps(dst, permv, src, Assembler::AVX_256bit); 5563 vpxor(xtmp, xtmp, xtmp, vec_enc); 5564 // Blend the result with the zero vector using the permute mask. Each column entry 5565 // in a permute table row contains either a valid permute index or -1 (the default); 5566 // the -1 entries can therefore be used as a blending mask after 5567 // compressing/expanding the source vector lanes.
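  // Illustrative example (row layout as assumed from the stub tables): for a
  // 32-bit compress with mask 0b0101, the selected row would be
  // [0, 2, -1, -1, -1, -1, -1, -1]; vpermps gathers lanes 0 and 2 into the low
  // positions, and the -1 entries, whose sign bit is set, direct the vblendvps
  // below to fill the remaining lanes with zeros from xtmp.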
5568 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5569 } 5570 #endif 5571 5572 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5573 bool merge, BasicType bt, int vec_enc) { 5574 if (opcode == Op_CompressV) { 5575 switch(bt) { 5576 case T_BYTE: 5577 evpcompressb(dst, mask, src, merge, vec_enc); 5578 break; 5579 case T_CHAR: 5580 case T_SHORT: 5581 evpcompressw(dst, mask, src, merge, vec_enc); 5582 break; 5583 case T_INT: 5584 evpcompressd(dst, mask, src, merge, vec_enc); 5585 break; 5586 case T_FLOAT: 5587 evcompressps(dst, mask, src, merge, vec_enc); 5588 break; 5589 case T_LONG: 5590 evpcompressq(dst, mask, src, merge, vec_enc); 5591 break; 5592 case T_DOUBLE: 5593 evcompresspd(dst, mask, src, merge, vec_enc); 5594 break; 5595 default: 5596 fatal("Unsupported type %s", type2name(bt)); 5597 break; 5598 } 5599 } else { 5600 assert(opcode == Op_ExpandV, ""); 5601 switch(bt) { 5602 case T_BYTE: 5603 evpexpandb(dst, mask, src, merge, vec_enc); 5604 break; 5605 case T_CHAR: 5606 case T_SHORT: 5607 evpexpandw(dst, mask, src, merge, vec_enc); 5608 break; 5609 case T_INT: 5610 evpexpandd(dst, mask, src, merge, vec_enc); 5611 break; 5612 case T_FLOAT: 5613 evexpandps(dst, mask, src, merge, vec_enc); 5614 break; 5615 case T_LONG: 5616 evpexpandq(dst, mask, src, merge, vec_enc); 5617 break; 5618 case T_DOUBLE: 5619 evexpandpd(dst, mask, src, merge, vec_enc); 5620 break; 5621 default: 5622 fatal("Unsupported type %s", type2name(bt)); 5623 break; 5624 } 5625 } 5626 } 5627 #endif 5628 5629 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5630 KRegister ktmp1, int vec_enc) { 5631 if (opcode == Op_SignumVD) { 5632 vsubpd(dst, zero, one, vec_enc); 5633 // if src < 0 ? -1 : 1 5634 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5635 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5636 // if src == NaN, -0.0 or 0.0 return src. 5637 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5638 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5639 } else { 5640 assert(opcode == Op_SignumVF, ""); 5641 vsubps(dst, zero, one, vec_enc); 5642 // if src < 0 ? -1 : 1 5643 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5644 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5645 // if src == NaN, -0.0 or 0.0 return src. 5646 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5647 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5648 } 5649 } 5650 5651 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5652 XMMRegister xtmp1, int vec_enc) { 5653 if (opcode == Op_SignumVD) { 5654 vsubpd(dst, zero, one, vec_enc); 5655 // if src < 0 ? -1 : 1 5656 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5657 // if src == NaN, -0.0 or 0.0 return src. 5658 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5659 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5660 } else { 5661 assert(opcode == Op_SignumVF, ""); 5662 vsubps(dst, zero, one, vec_enc); 5663 // if src < 0 ? -1 : 1 5664 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5665 // if src == NaN, -0.0 or 0.0 return src. 
5666 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5667 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5668 } 5669 } 5670 5671 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5672 if (VM_Version::supports_avx512bw()) { 5673 if (mask_len > 32) { 5674 kmovql(dst, src); 5675 } else { 5676 kmovdl(dst, src); 5677 if (mask_len != 32) { 5678 kshiftrdl(dst, dst, 32 - mask_len); 5679 } 5680 } 5681 } else { 5682 assert(mask_len <= 16, ""); 5683 kmovwl(dst, src); 5684 if (mask_len != 16) { 5685 kshiftrwl(dst, dst, 16 - mask_len); 5686 } 5687 } 5688 } 5689 5690 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5691 int lane_size = type2aelembytes(bt); 5692 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); 5693 if ((is_LP64 || lane_size < 8) && 5694 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5695 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) { 5696 movptr(rtmp, imm32); 5697 switch(lane_size) { 5698 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5699 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5700 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5701 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5702 default: fatal("Unsupported lane size %d", lane_size); 5703 break; 5704 } 5705 } else { 5706 movptr(rtmp, imm32); 5707 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp)); 5708 switch(lane_size) { 5709 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5710 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5711 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5712 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5713 default: fatal("Unsupported lane size %d", lane_size); 5714 break; 5715 } 5716 } 5717 } 5718 5719 // 5720 // Following is the lookup table based popcount computation algorithm: 5721 // Index Bit set count 5722 // [ 0000 -> 0, 5723 // 0001 -> 1, 5724 // 0010 -> 1, 5725 // 0011 -> 2, 5726 // 0100 -> 1, 5727 // 0101 -> 2, 5728 // 0110 -> 2, 5729 // 0111 -> 3, 5730 // 1000 -> 1, 5731 // 1001 -> 2, 5732 // 1010 -> 2, 5733 // 1011 -> 3, 5734 // 1100 -> 2, 5735 // 1101 -> 3, 5736 // 1110 -> 3, 1111 -> 4 ] 5737 // a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as 5738 // shuffle indices for lookup table access. 5739 // b. Right shift each byte of the vector lane by 4 positions. 5740 // c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as 5741 // shuffle indices for lookup table access. 5742 // d. Add the bitset counts of the upper and lower 4 bits of each byte. 5743 // e. Unpack double words to quad words and compute the sum of absolute differences of the bitset 5744 // counts of all the bytes of a quadword. 5745 // f. Perform step e. for the upper 128 bit vector lane. 5746 // g. Pack the bitset counts of quadwords back to double words. 5747 // h. Unpacking and packing operations are not needed for a 64 bit vector lane.
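//
// A worked example of steps a-d (illustrative): for the byte 0xA7 (0b10100111),
// the lower nibble 0111 looks up 3 and the upper nibble 1010 looks up 2, so the
// per-byte popcount is 3 + 2 = 5.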
5748 5749 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5750 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5751 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5752 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5753 vpsrlw(dst, src, 4, vec_enc); 5754 vpand(dst, dst, xtmp1, vec_enc); 5755 vpand(xtmp1, src, xtmp1, vec_enc); 5756 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5757 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5758 vpshufb(dst, xtmp2, dst, vec_enc); 5759 vpaddb(dst, dst, xtmp1, vec_enc); 5760 } 5761 5762 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5763 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5764 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5765 // Following code is as per steps e,f,g and h of above algorithm. 5766 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5767 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5768 vpsadbw(dst, dst, xtmp2, vec_enc); 5769 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5770 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5771 vpackuswb(dst, xtmp1, dst, vec_enc); 5772 } 5773 5774 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5775 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5776 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5777 // Add the popcount of upper and lower bytes of word. 5778 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5779 vpsrlw(dst, xtmp1, 8, vec_enc); 5780 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5781 vpaddw(dst, dst, xtmp1, vec_enc); 5782 } 5783 5784 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5785 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5786 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5787 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5788 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5789 } 5790 5791 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5792 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5793 switch(bt) { 5794 case T_LONG: 5795 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5796 break; 5797 case T_INT: 5798 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5799 break; 5800 case T_CHAR: 5801 case T_SHORT: 5802 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5803 break; 5804 case T_BYTE: 5805 case T_BOOLEAN: 5806 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5807 break; 5808 default: 5809 fatal("Unsupported type %s", type2name(bt)); 5810 break; 5811 } 5812 } 5813 5814 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5815 KRegister mask, bool merge, int vec_enc) { 5816 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5817 switch(bt) { 5818 case T_LONG: 5819 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5820 evpopcntq(dst, mask, src, merge, vec_enc); 5821 break; 5822 case T_INT: 5823 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5824 evpopcntd(dst, mask, src, merge, vec_enc); 5825 break; 5826 case T_CHAR: 5827 case T_SHORT: 5828 assert(VM_Version::supports_avx512_bitalg(), ""); 5829 evpopcntw(dst, mask, src, merge, vec_enc); 5830 break; 5831 case T_BYTE: 5832 case T_BOOLEAN: 5833 assert(VM_Version::supports_avx512_bitalg(), ""); 5834 evpopcntb(dst, mask, 
src, merge, vec_enc); 5835 break; 5836 default: 5837 fatal("Unsupported type %s", type2name(bt)); 5838 break; 5839 } 5840 } 5841 5842 #ifndef _LP64 5843 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5844 assert(VM_Version::supports_avx512bw(), ""); 5845 kmovdl(tmp, src); 5846 kunpckdql(dst, tmp, tmp); 5847 } 5848 #endif 5849 5850 // Bit reversal algorithm first reverses the bits of each byte followed by 5851 // a byte level reversal for multi-byte primitive types (short/int/long). 5852 // Algorithm performs a lookup table access to get reverse bit sequence 5853 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5854 // is obtained by swapping the reverse bit sequences of upper and lower 5855 // nibble of a byte. 5856 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5857 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5858 if (VM_Version::supports_avx512vlbw()) { 5859 5860 // Get the reverse bit sequence of lower nibble of each byte. 5861 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5862 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5863 evpandq(dst, xtmp2, src, vec_enc); 5864 vpshufb(dst, xtmp1, dst, vec_enc); 5865 vpsllq(dst, dst, 4, vec_enc); 5866 5867 // Get the reverse bit sequence of upper nibble of each byte. 5868 vpandn(xtmp2, xtmp2, src, vec_enc); 5869 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5870 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5871 5872 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5873 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5874 evporq(xtmp2, dst, xtmp2, vec_enc); 5875 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5876 5877 } else if(vec_enc == Assembler::AVX_512bit) { 5878 // Shift based bit reversal. 5879 assert(bt == T_LONG || bt == T_INT, ""); 5880 5881 // Swap lower and upper nibble of each byte. 5882 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5883 5884 // Swap two least and most significant bits of each nibble. 5885 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5886 5887 // Swap adjacent pair of bits. 5888 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5889 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5890 5891 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5892 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5893 } else { 5894 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5895 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5896 5897 // Get the reverse bit sequence of lower nibble of each byte. 5898 vpand(dst, xtmp2, src, vec_enc); 5899 vpshufb(dst, xtmp1, dst, vec_enc); 5900 vpsllq(dst, dst, 4, vec_enc); 5901 5902 // Get the reverse bit sequence of upper nibble of each byte. 5903 vpandn(xtmp2, xtmp2, src, vec_enc); 5904 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5905 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5906 5907 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5908 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
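    // Illustrative per-byte trace: for the byte 0b11010010, the lower nibble 0010
    // reverses via the LUT to 0100 and is shifted left by 4, the upper nibble 1101
    // reverses to 1011, and OR-ing 01000000 with 00001011 yields 0b01001011, the
    // bit-reversed byte.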
5909 vpor(xtmp2, dst, xtmp2, vec_enc); 5910 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5911 } 5912 } 5913 5914 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5915 XMMRegister xtmp, Register rscratch) { 5916 assert(VM_Version::supports_gfni(), ""); 5917 assert(rscratch != noreg || always_reachable(mask), "missing"); 5918 5919 // Galois field instruction based bit reversal based on following algorithm. 5920 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5921 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5922 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5923 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5924 } 5925 5926 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5927 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5928 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5929 evpandq(dst, xtmp1, src, vec_enc); 5930 vpsllq(dst, dst, nbits, vec_enc); 5931 vpandn(xtmp1, xtmp1, src, vec_enc); 5932 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5933 evporq(dst, dst, xtmp1, vec_enc); 5934 } 5935 5936 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5937 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5938 // Shift based bit reversal. 5939 assert(VM_Version::supports_evex(), ""); 5940 switch(bt) { 5941 case T_LONG: 5942 // Swap upper and lower double word of each quad word. 5943 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5944 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5945 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5946 break; 5947 case T_INT: 5948 // Swap upper and lower word of each double word. 5949 evprord(xtmp1, k0, src, 16, true, vec_enc); 5950 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5951 break; 5952 case T_CHAR: 5953 case T_SHORT: 5954 // Swap upper and lower byte of each word. 5955 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5956 break; 5957 case T_BYTE: 5958 evmovdquq(dst, k0, src, true, vec_enc); 5959 break; 5960 default: 5961 fatal("Unsupported type %s", type2name(bt)); 5962 break; 5963 } 5964 } 5965 5966 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5967 if (bt == T_BYTE) { 5968 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5969 evmovdquq(dst, k0, src, true, vec_enc); 5970 } else { 5971 vmovdqu(dst, src); 5972 } 5973 return; 5974 } 5975 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5976 // pre-computed shuffle indices. 
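  // For example (assuming the T_INT permutation mask holds the byte indices
  // 3,2,1,0 within each 4-byte group), vpshufb turns each int lane 0x11223344
  // into 0x44332211.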
5977 switch(bt) { 5978 case T_LONG: 5979 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5980 break; 5981 case T_INT: 5982 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5983 break; 5984 case T_CHAR: 5985 case T_SHORT: 5986 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 5987 break; 5988 default: 5989 fatal("Unsupported type %s", type2name(bt)); 5990 break; 5991 } 5992 vpshufb(dst, src, dst, vec_enc); 5993 } 5994 5995 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5996 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5997 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 5998 assert(is_integral_type(bt), ""); 5999 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6000 assert(VM_Version::supports_avx512cd(), ""); 6001 switch(bt) { 6002 case T_LONG: 6003 evplzcntq(dst, ktmp, src, merge, vec_enc); 6004 break; 6005 case T_INT: 6006 evplzcntd(dst, ktmp, src, merge, vec_enc); 6007 break; 6008 case T_SHORT: 6009 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6010 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6011 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6012 vpunpckhwd(dst, xtmp1, src, vec_enc); 6013 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6014 vpackusdw(dst, xtmp2, dst, vec_enc); 6015 break; 6016 case T_BYTE: 6017 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6018 // accessing the lookup table. 6019 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6020 // accessing the lookup table. 6021 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6022 assert(VM_Version::supports_avx512bw(), ""); 6023 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6024 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6025 vpand(xtmp2, dst, src, vec_enc); 6026 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6027 vpsrlw(xtmp3, src, 4, vec_enc); 6028 vpand(xtmp3, dst, xtmp3, vec_enc); 6029 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6030 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6031 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6032 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6033 break; 6034 default: 6035 fatal("Unsupported type %s", type2name(bt)); 6036 break; 6037 } 6038 } 6039 6040 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6041 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6042 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6043 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6044 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6045 // accessing the lookup table. 6046 vpand(dst, xtmp2, src, vec_enc); 6047 vpshufb(dst, xtmp1, dst, vec_enc); 6048 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6049 // accessing the lookup table. 6050 vpsrlw(xtmp3, src, 4, vec_enc); 6051 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6052 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6053 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
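  // Illustrative per-byte traces: for 0b00010110 the upper nibble 0001 is
  // non-zero and looks up 3, so the result is 3; for 0b00000110 the upper
  // nibble is zero, so its count of 4 is added to the lower nibble's count
  // (0110 looks up 1) for a total of 5 leading zeros.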
6054 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6055 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6056 vpaddb(dst, dst, xtmp2, vec_enc); 6057 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6058 } 6059 6060 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6061 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6062 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6063 // Add the zero counts of the lower and upper byte of a word if the 6064 // upper byte holds a zero value. 6065 vpsrlw(xtmp3, src, 8, vec_enc); 6066 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6067 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6068 vpsllw(xtmp2, dst, 8, vec_enc); 6069 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6070 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6071 vpsrlw(dst, dst, 8, vec_enc); 6072 } 6073 6074 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6075 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6076 // Since the IEEE 754 floating point format represents the mantissa in 1.x form, 6077 // the biased exponent can be used to compute the leading zero count as per the 6078 // following formula (evaluated below as 32 - ((biased_exp - 127) + 1)): 6079 // LZCNT = 31 - (biased_exp - 127) 6080 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6081 6082 // Broadcast 0xFF 6083 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6084 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6085 6086 // Extract biased exponent. 6087 vcvtdq2ps(dst, src, vec_enc); 6088 vpsrld(dst, dst, 23, vec_enc); 6089 vpand(dst, dst, xtmp1, vec_enc); 6090 6091 // Broadcast 127. 6092 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6093 // Exponent = biased_exp - 127 6094 vpsubd(dst, dst, xtmp1, vec_enc); 6095 6096 // Exponent = Exponent + 1 6097 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6098 vpaddd(dst, dst, xtmp3, vec_enc); 6099 6100 // Replace a -ve exponent with zero; the exponent is -ve when the src 6101 // lane contains a zero value. 6102 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6103 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6104 6105 // Rematerialize broadcast 32. 6106 vpslld(xtmp1, xtmp3, 5, vec_enc); 6107 // Exponent is 32 if the corresponding source lane contains the max_int value. 6108 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6109 // LZCNT = 32 - exponent 6110 vpsubd(dst, xtmp1, dst, vec_enc); 6111 6112 // Replace LZCNT with the value 1 if the corresponding source lane 6113 // contains the max_int value. 6114 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6115 6116 // Replace the computed count with 0 if the source lane value is less than zero. 6117 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6118 vblendvps(dst, dst, xtmp2, src, vec_enc); 6119 } 6120 6121 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6122 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6123 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6124 // Add the zero counts of the lower and upper word of a double word if the 6125 // upper word holds a zero value. 6126 vpsrld(xtmp3, src, 16, vec_enc); 6127 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6128 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6129 vpslld(xtmp2, dst, 16, vec_enc); 6130 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6131 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6132 vpsrld(dst, dst, 16, vec_enc); 6133 // Add the zero counts of the lower and upper doubleword of a 6134 // quadword if the upper doubleword holds a zero value.
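  // For example (illustrative): for the quadword 0x0000000000001234 the upper
  // doubleword is zero, so its count of 32 is added to the lower doubleword's
  // count of 19, giving 51 leading zeros for the 64-bit lane.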
  vpsrlq(xtmp3, src, 32, vec_enc);
  vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
  vpsllq(xtmp2, dst, 32, vec_enc);
  vpaddq(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlq(dst, dst, 32, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
                                                       XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                       Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "unexpected type");
  assert(vec_enc < Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_INT:
      vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
      break;
    case T_SHORT:
      vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_BYTE:
      vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
  switch(bt) {
    case T_BYTE:
      vpsubb(dst, src1, src2, vec_enc);
      break;
    case T_SHORT:
      vpsubw(dst, src1, src2, vec_enc);
      break;
    case T_INT:
      vpsubd(dst, src1, src2, vec_enc);
      break;
    case T_LONG:
      vpsubq(dst, src1, src2, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// Trailing zero count computation is based on the leading zero count
// operation, per the following identity. All AVX3 targets support the
// AVX512CD feature, which provides a direct vector instruction to compute
// leading zero counts.
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
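// A minimal scalar sketch of the identity (illustrative only; __builtin_clz
// is a GCC/Clang builtin and an assumption here, not what the vector code
// emits):
//   static int ctz32(uint32_t x) {
//     uint32_t t = (x - 1) & ~x;               // ones below the lowest set bit
//     int clz = (t == 0) ? 32 : __builtin_clz(t);
//     return 32 - clz;                         // CTZ = 32 - CLZ((x - 1) & ~x)
//   }
// Note that vplzcntd returns 32 for a zero input, so the vector code needs
// no special case for t == 0.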
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src (i.e. src - 1)
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}

// Trailing zero count computation for AVX2 targets is based on the popcount
// operation, per the following identity:
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}

void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}

void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
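  // A minimal scalar sketch of this fastpath (illustrative only): when the
  // divisor, viewed as unsigned, has its sign bit set (i.e. divisor >= 2^31),
  // the unsigned quotient can only be 0 or 1, and it is 1 exactly when
  // dividend >= divisor (unsigned):
  //   uint32_t q = (dividend & ~(dividend - divisor)) >> 31;  // 0 or 1
  //   uint32_t r = dividend - (q ? divisor : 0);
  // The sar/and sequence below materializes (q ? divisor : 0) branchlessly.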
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}

#ifdef _LP64
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Bit reversal via a Galois-field affine transform, based on the
    // following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  bswapl(dst);
}

void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Bit reversal via a Galois-field affine transform, based on the
    // following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

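    // The same mask-shift-or pattern is applied once more below at nibble
    // granularity; together with the final bswapq this reverses all 64 bits.
    // A minimal scalar sketch of the whole sequence (illustrative only;
    // __builtin_bswap64 is a compiler builtin and an assumption here):
    //   x = ((x & 0x5555555555555555ULL) << 1) | ((x >> 1) & 0x5555555555555555ULL);
    //   x = ((x & 0x3333333333333333ULL) << 2) | ((x >> 2) & 0x3333333333333333ULL);
    //   x = ((x & 0x0F0F0F0F0F0F0F0FULL) << 4) | ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL);
    //   x = __builtin_bswap64(x);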
    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  bswapq(dst);
}

void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
#endif

void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations: vpshufb selects bytes using only
  // the lower 4 bits of each shuffle lane, so all shuffle indices are
  // effectively normalized to the range 0-15. As a result, indices that are
  // equal modulo 16 select the same relative position within a 128-bit
  // lane, e.g. shuffle indices 16, 32 and 48 all select the first element
  // of their respective 128-bit source lanes.
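  // A minimal scalar sketch of the per-byte selection performed by the four
  // masked passes below (illustrative only, assuming shuffle indices in the
  // range [0, 63]):
  //   for (int i = 0; i < 64; i++) {
  //     int lane = shuffle[i] >> 4;        // which 128-bit source lane
  //     dst[i] = src[lane * 16 + (shuffle[i] & 0x0F)];
  //   }
  // Each pass broadcasts one source lane, shuffles it with the original
  // indices, and merges the result under a mask selecting the indices that
  // fall in that lane's 16-element range.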
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for the shuffle vector by comparing indices with the
  // expression INDEX < 16. Broadcast the first 128-bit lane across the
  // entire vector, shuffle the vector lanes using the original shuffle
  // indices, and move the shuffled lanes corresponding to a true mask
  // into the destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform the above steps with the lane comparison expression
  // INDEX >= 16 && INDEX < 32, broadcasting the second 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression
  // INDEX >= 32 && INDEX < 48, broadcasting the third 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression
  // INDEX >= 48 && INDEX < 64, broadcasting the fourth 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}

void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}
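// A minimal scalar model of the rearrange dispatched above (illustrative
// only): for 32-bit lanes, dst[i] = src[shuffle[i]]. At AVX_128bit the
// in-lane vpermilps suffices, since a single 128-bit lane spans the whole
// vector; wider vectors need the cross-lane vpermd/vpermps forms.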