1 /* 2 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "gc/shared/barrierSet.hpp" 29 #include "gc/shared/barrierSetAssembler.hpp" 30 #include "oops/methodData.hpp" 31 #include "opto/c2_MacroAssembler.hpp" 32 #include "opto/intrinsicnode.hpp" 33 #include "opto/output.hpp" 34 #include "opto/opcodes.hpp" 35 #include "opto/subnode.hpp" 36 #include "runtime/objectMonitor.hpp" 37 #include "runtime/stubRoutines.hpp" 38 39 #ifdef PRODUCT 40 #define BLOCK_COMMENT(str) /* nothing */ 41 #define STOP(error) stop(error) 42 #else 43 #define BLOCK_COMMENT(str) block_comment(str) 44 #define STOP(error) block_comment(error); stop(error) 45 #endif 46 47 // C2 compiled method's prolog code. 48 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) { 49 if (C->clinit_barrier_on_entry()) { 50 assert(VM_Version::supports_fast_class_init_checks(), "sanity"); 51 assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started"); 52 53 Label L_skip_barrier; 54 Register klass = rscratch1; 55 56 mov_metadata(klass, C->method()->holder()->constant_encoding()); 57 clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 58 59 jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 60 61 bind(L_skip_barrier); 62 } 63 64 int framesize = C->output()->frame_size_in_bytes(); 65 int bangsize = C->output()->bang_size_in_bytes(); 66 bool fp_mode_24b = false; 67 int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0; 68 69 // WARNING: Initial instruction MUST be 5 bytes or longer so that 70 // NativeJump::patch_verified_entry will be able to patch out the entry 71 // code safely. The push to verify stack depth is ok at 5 bytes, 72 // the frame allocation can be either 3 or 6 bytes. So if we don't do 73 // stack bang then we must use the 6 byte frame allocation even if 74 // we have no frame. :-( 75 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 76 77 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 78 // Remove word for return addr 79 framesize -= wordSize; 80 stack_bang_size -= wordSize; 81 82 // Calls to C2R adapters often do not accept exceptional returns. 83 // We require that their callers must bang for them. 
But be careful, because 84 // some VM calls (such as call site linkage) can use several kilobytes of 85 // stack. But the stack safety zone should account for that. 86 // See bugs 4446381, 4468289, 4497237. 87 if (stack_bang_size > 0) { 88 generate_stack_overflow_check(stack_bang_size); 89 90 // We always push rbp, so that on return to interpreter rbp, will be 91 // restored correctly and we can correct the stack. 92 push(rbp); 93 // Save caller's stack pointer into RBP if the frame pointer is preserved. 94 if (PreserveFramePointer) { 95 mov(rbp, rsp); 96 } 97 // Remove word for ebp 98 framesize -= wordSize; 99 100 // Create frame 101 if (framesize) { 102 subptr(rsp, framesize); 103 } 104 } else { 105 // Create frame (force generation of a 4 byte immediate value) 106 subptr_imm32(rsp, framesize); 107 108 // Save RBP register now. 109 framesize -= wordSize; 110 movptr(Address(rsp, framesize), rbp); 111 // Save caller's stack pointer into RBP if the frame pointer is preserved. 112 if (PreserveFramePointer) { 113 movptr(rbp, rsp); 114 if (framesize > 0) { 115 addptr(rbp, framesize); 116 } 117 } 118 } 119 120 if (C->needs_stack_repair()) { 121 // Save stack increment just below the saved rbp (also account for fixed framesize and rbp) 122 assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned"); 123 movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize); 124 } 125 126 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 127 framesize -= wordSize; 128 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 129 } 130 131 #ifndef _LP64 132 // If method sets FPU control word do it now 133 if (fp_mode_24b) { 134 fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24())); 135 } 136 if (UseSSE >= 2 && VerifyFPU) { 137 verify_FPU(0, "FPU stack must be clean on entry"); 138 } 139 #endif 140 141 #ifdef ASSERT 142 if (VerifyStackAtCalls) { 143 Label L; 144 push(rax); 145 mov(rax, rsp); 146 andptr(rax, StackAlignmentInBytes-1); 147 cmpptr(rax, StackAlignmentInBytes-wordSize); 148 pop(rax); 149 jcc(Assembler::equal, L); 150 STOP("Stack is not properly aligned!"); 151 bind(L); 152 } 153 #endif 154 } 155 156 void C2_MacroAssembler::entry_barrier() { 157 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 158 #ifdef _LP64 159 if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) { 160 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub. 161 Label dummy_slow_path; 162 Label dummy_continuation; 163 Label* slow_path = &dummy_slow_path; 164 Label* continuation = &dummy_continuation; 165 if (!Compile::current()->output()->in_scratch_emit_size()) { 166 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 167 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 168 Compile::current()->output()->add_stub(stub); 169 slow_path = &stub->entry(); 170 continuation = &stub->continuation(); 171 } 172 bs->nmethod_entry_barrier(this, slow_path, continuation); 173 } 174 #else 175 // Don't bother with out-of-line nmethod entry barrier stub for x86_32. 
176 bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */); 177 #endif 178 } 179 180 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 181 switch (vlen_in_bytes) { 182 case 4: // fall-through 183 case 8: // fall-through 184 case 16: return Assembler::AVX_128bit; 185 case 32: return Assembler::AVX_256bit; 186 case 64: return Assembler::AVX_512bit; 187 188 default: { 189 ShouldNotReachHere(); 190 return Assembler::AVX_NoVec; 191 } 192 } 193 } 194 195 #if INCLUDE_RTM_OPT 196 197 // Update rtm_counters based on abort status 198 // input: abort_status 199 // rtm_counters (RTMLockingCounters*) 200 // flags are killed 201 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) { 202 203 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset())); 204 if (PrintPreciseRTMLockingStatistics) { 205 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 206 Label check_abort; 207 testl(abort_status, (1<<i)); 208 jccb(Assembler::equal, check_abort); 209 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx)))); 210 bind(check_abort); 211 } 212 } 213 } 214 215 // Branch if (random & (count-1) != 0), count is 2^n 216 // tmp, scr and flags are killed 217 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) { 218 assert(tmp == rax, ""); 219 assert(scr == rdx, ""); 220 rdtsc(); // modifies EDX:EAX 221 andptr(tmp, count-1); 222 jccb(Assembler::notZero, brLabel); 223 } 224 225 // Perform abort ratio calculation, set no_rtm bit if high ratio 226 // input: rtm_counters_Reg (RTMLockingCounters* address) 227 // tmpReg, rtm_counters_Reg and flags are killed 228 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg, 229 Register rtm_counters_Reg, 230 RTMLockingCounters* rtm_counters, 231 Metadata* method_data) { 232 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 233 234 if (RTMLockingCalculationDelay > 0) { 235 // Delay calculation 236 movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr())); 237 testptr(tmpReg, tmpReg); 238 jccb(Assembler::equal, L_done); 239 } 240 // Abort ratio calculation only if abort_count > RTMAbortThreshold 241 // Aborted transactions = abort_count * 100 242 // All transactions = total_count * RTMTotalCountIncrRate 243 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 244 245 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset())); 246 cmpptr(tmpReg, RTMAbortThreshold); 247 jccb(Assembler::below, L_check_always_rtm2); 248 imulptr(tmpReg, tmpReg, 100); 249 250 Register scrReg = rtm_counters_Reg; 251 movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset())); 252 imulptr(scrReg, scrReg, RTMTotalCountIncrRate); 253 imulptr(scrReg, scrReg, RTMAbortRatio); 254 cmpptr(tmpReg, scrReg); 255 jccb(Assembler::below, L_check_always_rtm1); 256 if (method_data != nullptr) { 257 // set rtm_state to "no rtm" in MDO 258 mov_metadata(tmpReg, method_data); 259 lock(); 260 orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM); 261 } 262 jmpb(L_done); 263 bind(L_check_always_rtm1); 264 // Reload RTMLockingCounters* address 265 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters)); 266 bind(L_check_always_rtm2); 267 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset())); 268 cmpptr(tmpReg, 
RTMLockingThreshold / RTMTotalCountIncrRate); 269 jccb(Assembler::below, L_done); 270 if (method_data != nullptr) { 271 // set rtm_state to "always rtm" in MDO 272 mov_metadata(tmpReg, method_data); 273 lock(); 274 orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM); 275 } 276 bind(L_done); 277 } 278 279 // Update counters and perform abort ratio calculation 280 // input: abort_status_Reg 281 // rtm_counters_Reg, flags are killed 282 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg, 283 Register rtm_counters_Reg, 284 RTMLockingCounters* rtm_counters, 285 Metadata* method_data, 286 bool profile_rtm) { 287 288 assert(rtm_counters != nullptr, "should not be null when profiling RTM"); 289 // update rtm counters based on rax value at abort 290 // reads abort_status_Reg, updates flags 291 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters)); 292 rtm_counters_update(abort_status_Reg, rtm_counters_Reg); 293 if (profile_rtm) { 294 // Save abort status because abort_status_Reg is used by following code. 295 if (RTMRetryCount > 0) { 296 push(abort_status_Reg); 297 } 298 assert(rtm_counters != nullptr, "should not be null when profiling RTM"); 299 rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data); 300 // restore abort status 301 if (RTMRetryCount > 0) { 302 pop(abort_status_Reg); 303 } 304 } 305 } 306 307 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4) 308 // inputs: retry_count_Reg 309 // : abort_status_Reg 310 // output: retry_count_Reg decremented by 1 311 // flags are killed 312 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) { 313 Label doneRetry; 314 assert(abort_status_Reg == rax, ""); 315 // The abort reason bits are in eax (see all states in rtmLocking.hpp) 316 // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4) 317 // if reason is in 0x6 and retry count != 0 then retry 318 andptr(abort_status_Reg, 0x6); 319 jccb(Assembler::zero, doneRetry); 320 testl(retry_count_Reg, retry_count_Reg); 321 jccb(Assembler::zero, doneRetry); 322 pause(); 323 decrementl(retry_count_Reg); 324 jmp(retryLabel); 325 bind(doneRetry); 326 } 327 328 // Spin and retry if lock is busy, 329 // inputs: box_Reg (monitor address) 330 // : retry_count_Reg 331 // output: retry_count_Reg decremented by 1 332 // : clear z flag if retry count exceeded 333 // tmp_Reg, scr_Reg, flags are killed 334 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg, 335 Register tmp_Reg, Register scr_Reg, Label& retryLabel) { 336 Label SpinLoop, SpinExit, doneRetry; 337 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 338 339 testl(retry_count_Reg, retry_count_Reg); 340 jccb(Assembler::zero, doneRetry); 341 decrementl(retry_count_Reg); 342 movptr(scr_Reg, RTMSpinLoopCount); 343 344 bind(SpinLoop); 345 pause(); 346 decrementl(scr_Reg); 347 jccb(Assembler::lessEqual, SpinExit); 348 movptr(tmp_Reg, Address(box_Reg, owner_offset)); 349 testptr(tmp_Reg, tmp_Reg); 350 jccb(Assembler::notZero, SpinLoop); 351 352 bind(SpinExit); 353 jmp(retryLabel); 354 bind(doneRetry); 355 incrementl(retry_count_Reg); // clear z flag 356 } 357 358 // Use RTM for normal stack locks 359 // Input: objReg (object to lock) 360 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg, 361 Register retry_on_abort_count_Reg, 362 RTMLockingCounters* stack_rtm_counters, 363 Metadata* method_data, bool 
profile_rtm, 364 Label& DONE_LABEL, Label& IsInflated) { 365 assert(UseRTMForStackLocks, "why call this otherwise?"); 366 assert(tmpReg == rax, ""); 367 assert(scrReg == rdx, ""); 368 Label L_rtm_retry, L_decrement_retry, L_on_abort; 369 370 if (RTMRetryCount > 0) { 371 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 372 bind(L_rtm_retry); 373 } 374 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); 375 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral 376 jcc(Assembler::notZero, IsInflated); 377 378 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 379 Label L_noincrement; 380 if (RTMTotalCountIncrRate > 1) { 381 // tmpReg, scrReg and flags are killed 382 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement); 383 } 384 assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM"); 385 atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg); 386 bind(L_noincrement); 387 } 388 xbegin(L_on_abort); 389 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword 390 andptr(tmpReg, markWord::lock_mask_in_place); // look at 2 lock bits 391 cmpptr(tmpReg, markWord::unlocked_value); // bits = 01 unlocked 392 jcc(Assembler::equal, DONE_LABEL); // all done if unlocked 393 394 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX 395 if (UseRTMXendForLockBusy) { 396 xend(); 397 movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry) 398 jmp(L_decrement_retry); 399 } 400 else { 401 xabort(0); 402 } 403 bind(L_on_abort); 404 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 405 rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm); 406 } 407 bind(L_decrement_retry); 408 if (RTMRetryCount > 0) { 409 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4) 410 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 411 } 412 } 413 414 // Use RTM for inflating locks 415 // inputs: objReg (object to lock) 416 // boxReg (on-stack box address (displaced header location) - KILLED) 417 // tmpReg (ObjectMonitor address + markWord::monitor_value) 418 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg, 419 Register scrReg, Register retry_on_busy_count_Reg, 420 Register retry_on_abort_count_Reg, 421 RTMLockingCounters* rtm_counters, 422 Metadata* method_data, bool profile_rtm, 423 Label& DONE_LABEL) { 424 assert(UseRTMLocking, "why call this otherwise?"); 425 assert(tmpReg == rax, ""); 426 assert(scrReg == rdx, ""); 427 Label L_rtm_retry, L_decrement_retry, L_on_abort; 428 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 429 430 movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value())); 431 movptr(boxReg, tmpReg); // Save ObjectMonitor address 432 433 if (RTMRetryCount > 0) { 434 movl(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy 435 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 436 bind(L_rtm_retry); 437 } 438 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 439 Label L_noincrement; 440 if (RTMTotalCountIncrRate > 1) { 441 // tmpReg, scrReg and flags are killed 442 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement); 443 } 444 assert(rtm_counters != nullptr, "should not be null when profiling RTM"); 445 
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//     fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//     fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
567 // In the case of failure, the node will branch directly to the 568 // FailureLabel 569 570 571 // obj: object to lock 572 // box: on-stack box address (displaced header location) - KILLED 573 // rax,: tmp -- KILLED 574 // scr: tmp -- KILLED 575 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, 576 Register scrReg, Register cx1Reg, Register cx2Reg, Register thread, 577 RTMLockingCounters* rtm_counters, 578 RTMLockingCounters* stack_rtm_counters, 579 Metadata* method_data, 580 bool use_rtm, bool profile_rtm) { 581 // Ensure the register assignments are disjoint 582 assert(tmpReg == rax, ""); 583 584 if (use_rtm) { 585 assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg); 586 } else { 587 assert(cx1Reg == noreg, ""); 588 assert(cx2Reg == noreg, ""); 589 assert_different_registers(objReg, boxReg, tmpReg, scrReg); 590 } 591 592 // Possible cases that we'll encounter in fast_lock 593 // ------------------------------------------------ 594 // * Inflated 595 // -- unlocked 596 // -- Locked 597 // = by self 598 // = by other 599 // * neutral 600 // * stack-locked 601 // -- by self 602 // = sp-proximity test hits 603 // = sp-proximity test generates false-negative 604 // -- by other 605 // 606 607 Label IsInflated, DONE_LABEL, NO_COUNT, COUNT; 608 609 if (DiagnoseSyncOnValueBasedClasses != 0) { 610 load_klass(tmpReg, objReg, scrReg); 611 movl(tmpReg, Address(tmpReg, Klass::access_flags_offset())); 612 testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS); 613 jcc(Assembler::notZero, DONE_LABEL); 614 } 615 616 #if INCLUDE_RTM_OPT 617 if (UseRTMForStackLocks && use_rtm) { 618 assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive"); 619 rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg, 620 stack_rtm_counters, method_data, profile_rtm, 621 DONE_LABEL, IsInflated); 622 } 623 #endif // INCLUDE_RTM_OPT 624 625 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH] 626 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral 627 jcc(Assembler::notZero, IsInflated); 628 629 if (LockingMode == LM_MONITOR) { 630 // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0. 631 testptr(objReg, objReg); 632 } else if (LockingMode == LM_LEGACY) { 633 // Attempt stack-locking ... 634 orptr (tmpReg, markWord::unlocked_value); 635 if (EnableValhalla) { 636 // Mask inline_type bit such that we go to the slow path if object is an inline type 637 andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place)); 638 } 639 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS 640 lock(); 641 cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg 642 jcc(Assembler::equal, COUNT); // Success 643 644 // Recursive locking. 645 // The object is stack-locked: markword contains stack pointer to BasicLock. 646 // Locked by current thread if difference with current SP is less than one page. 647 subptr(tmpReg, rsp); 648 // Next instruction set ZFlag == 1 (Success) if difference is less then one page. 649 andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) ); 650 movptr(Address(boxReg, 0), tmpReg); 651 } else { 652 assert(LockingMode == LM_LIGHTWEIGHT, ""); 653 fast_lock_impl(objReg, tmpReg, thread, scrReg, NO_COUNT); 654 jmp(COUNT); 655 } 656 jmp(DONE_LABEL); 657 658 bind(IsInflated); 659 // The object is inflated. 
// tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
  // If we weren't able to swing _owner from null to the BasicLock
  // then take the slow path.
  jccb (Assembler::notZero, NO_COUNT);
  // update _owner from BasicLock to thread
  get_thread (scrReg); // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);       // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(thread, rax);                 // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT); // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
  increment(Address(thread, JavaThread::held_monitor_count_offset()));

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1: At return-time the interpreter automatically and quietly unlocks any
//     objects acquired by the current activation (frame). Recall that the
//     interpreter maintains an on-stack list of locks currently held by
//     a frame.
// I2: If a method attempts to unlock an object that is not held by the
//     frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
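//
// For orientation only: the LP64 inflated-unlock sequence emitted below follows
// roughly this C-like shape (a schematic sketch, not an actual helper in this
// file; the pseudo-field names mirror the ObjectMonitor comments further down):
//
//   if (m->_recursions != 0) { m->_recursions--; return success; }
//   if ((m->_cxq | m->_EntryList) == 0) { m->_owner = nullptr; return success; }
//   // contended: see the CheckSucc / re-acquire handling sketched below
//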
783 784 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) { 785 assert(boxReg == rax, ""); 786 assert_different_registers(objReg, boxReg, tmpReg); 787 788 Label DONE_LABEL, Stacked, COUNT, NO_COUNT; 789 790 #if INCLUDE_RTM_OPT 791 if (UseRTMForStackLocks && use_rtm) { 792 assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive"); 793 Label L_regular_unlock; 794 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword 795 andptr(tmpReg, markWord::lock_mask_in_place); // look at 2 lock bits 796 cmpptr(tmpReg, markWord::unlocked_value); // bits = 01 unlocked 797 jccb(Assembler::notEqual, L_regular_unlock); // if !HLE RegularLock 798 xend(); // otherwise end... 799 jmp(DONE_LABEL); // ... and we're done 800 bind(L_regular_unlock); 801 } 802 #endif 803 804 if (LockingMode == LM_LEGACY) { 805 cmpptr(Address(boxReg, 0), NULL_WORD); // Examine the displaced header 806 jcc (Assembler::zero, COUNT); // 0 indicates recursive stack-lock 807 } 808 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword 809 if (LockingMode != LM_MONITOR) { 810 testptr(tmpReg, markWord::monitor_value); // Inflated? 811 jcc(Assembler::zero, Stacked); 812 } 813 814 // It's inflated. 815 if (LockingMode == LM_LIGHTWEIGHT) { 816 // If the owner is ANONYMOUS, we need to fix it - in an outline stub. 817 testb(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) ObjectMonitor::ANONYMOUS_OWNER); 818 #ifdef _LP64 819 if (!Compile::current()->output()->in_scratch_emit_size()) { 820 C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg, boxReg); 821 Compile::current()->output()->add_stub(stub); 822 jcc(Assembler::notEqual, stub->entry()); 823 bind(stub->continuation()); 824 } else 825 #endif 826 { 827 // We can't easily implement this optimization on 32 bit because we don't have a thread register. 828 // Call the slow-path instead. 829 jcc(Assembler::notEqual, NO_COUNT); 830 } 831 } 832 833 #if INCLUDE_RTM_OPT 834 if (use_rtm) { 835 Label L_regular_inflated_unlock; 836 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 837 movptr(boxReg, Address(tmpReg, owner_offset)); 838 testptr(boxReg, boxReg); 839 jccb(Assembler::notZero, L_regular_inflated_unlock); 840 xend(); 841 jmp(DONE_LABEL); 842 bind(L_regular_inflated_unlock); 843 } 844 #endif 845 846 // Despite our balanced locking property we still check that m->_owner == Self 847 // as java routines or native JNI code called by this thread might 848 // have released the lock. 849 // Refer to the comments in synchronizer.cpp for how we might encode extra 850 // state in _succ so we can avoid fetching EntryList|cxq. 851 // 852 // If there's no contention try a 1-0 exit. That is, exit without 853 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how 854 // we detect and recover from the race that the 1-0 exit admits. 855 // 856 // Conceptually fast_unlock() must execute a STST|LDST "release" barrier 857 // before it STs null into _owner, releasing the lock. Updates 858 // to data protected by the critical section must be visible before 859 // we drop the lock (and thus before any other thread could acquire 860 // the lock and observe the fields protected by the lock). 861 // IA32's memory-model is SPO, so STs are ordered with respect to 862 // each other and there's no need for an explicit barrier (fence). 
863 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. 864 #ifndef _LP64 865 // Note that we could employ various encoding schemes to reduce 866 // the number of loads below (currently 4) to just 2 or 3. 867 // Refer to the comments in synchronizer.cpp. 868 // In practice the chain of fetches doesn't seem to impact performance, however. 869 xorptr(boxReg, boxReg); 870 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 871 jccb (Assembler::notZero, DONE_LABEL); 872 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 873 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 874 jccb (Assembler::notZero, DONE_LABEL); 875 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 876 jmpb (DONE_LABEL); 877 #else // _LP64 878 // It's inflated 879 Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath; 880 881 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0); 882 jccb(Assembler::equal, LNotRecursive); 883 884 // Recursive inflated unlock 885 decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 886 jmpb(LSuccess); 887 888 bind(LNotRecursive); 889 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 890 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 891 jccb (Assembler::notZero, CheckSucc); 892 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 893 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 894 jmpb (DONE_LABEL); 895 896 // Try to avoid passing control into the slow_path ... 897 bind (CheckSucc); 898 899 // The following optional optimization can be elided if necessary 900 // Effectively: if (succ == null) goto slow path 901 // The code reduces the window for a race, however, 902 // and thus benefits performance. 903 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 904 jccb (Assembler::zero, LGoSlowPath); 905 906 xorptr(boxReg, boxReg); 907 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 908 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 909 910 // Memory barrier/fence 911 // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ 912 // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack. 913 // This is faster on Nehalem and AMD Shanghai/Barcelona. 914 // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences 915 // We might also restructure (ST Owner=0;barrier;LD _Succ) to 916 // (mov box,0; xchgq box, &m->Owner; LD _succ) . 917 lock(); addl(Address(rsp, 0), 0); 918 919 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 920 jccb (Assembler::notZero, LSuccess); 921 922 // Rare inopportune interleaving - race. 923 // The successor vanished in the small window above. 924 // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor. 925 // We need to ensure progress and succession. 926 // Try to reacquire the lock. 927 // If that fails then the new owner is responsible for succession and this 928 // thread needs to take no further action and can exit via the fast path (success). 929 // If the re-acquire succeeds then pass control into the slow path. 930 // As implemented, this latter mode is horrible because we generated more 931 // coherence traffic on the lock *and* artificially extended the critical section 932 // length while by virtue of passing control into the slow path. 
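  // Schematically (an illustrative sketch of the re-acquire step that follows,
  // using the pseudo-fields from the comments above, not emitted code):
  //   if (CAS(&m->_owner, nullptr, Self) != nullptr)  // another thread took the lock
  //     return success;                               // it inherits the succession duty
  //   goto slow_path;                                 // we own it again; wake a successor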
933 934 // box is really RAX -- the following CMPXCHG depends on that binding 935 // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R) 936 lock(); 937 cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 938 // There's no successor so we tried to regrab the lock. 939 // If that didn't work, then another thread grabbed the 940 // lock so we're done (and exit was a success). 941 jccb (Assembler::notEqual, LSuccess); 942 // Intentional fall-through into slow path 943 944 bind (LGoSlowPath); 945 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 946 jmpb (DONE_LABEL); 947 948 bind (LSuccess); 949 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 950 jmpb (DONE_LABEL); 951 952 #endif 953 if (LockingMode != LM_MONITOR) { 954 bind (Stacked); 955 if (LockingMode == LM_LIGHTWEIGHT) { 956 mov(boxReg, tmpReg); 957 fast_unlock_impl(objReg, boxReg, tmpReg, NO_COUNT); 958 jmp(COUNT); 959 } else if (LockingMode == LM_LEGACY) { 960 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 961 lock(); 962 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 963 } 964 // Intentional fall-thru into DONE_LABEL 965 } 966 bind(DONE_LABEL); 967 968 // ZFlag == 1 count in fast path 969 // ZFlag == 0 count in slow path 970 jccb(Assembler::notZero, NO_COUNT); 971 972 bind(COUNT); 973 // Count monitors in fast path 974 #ifndef _LP64 975 get_thread(tmpReg); 976 decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset())); 977 #else // _LP64 978 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 979 #endif 980 981 xorl(tmpReg, tmpReg); // Set ZF == 1 982 983 bind(NO_COUNT); 984 } 985 986 //------------------------------------------------------------------------------------------- 987 // Generic instructions support for use in .ad files C2 code generation 988 989 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 990 if (dst != src) { 991 movdqu(dst, src); 992 } 993 if (opcode == Op_AbsVD) { 994 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 995 } else { 996 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 997 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 998 } 999 } 1000 1001 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 1002 if (opcode == Op_AbsVD) { 1003 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 1004 } else { 1005 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 1006 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 1007 } 1008 } 1009 1010 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 1011 if (dst != src) { 1012 movdqu(dst, src); 1013 } 1014 if (opcode == Op_AbsVF) { 1015 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 1016 } else { 1017 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 1018 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1019 } 1020 } 1021 1022 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 1023 if (opcode == Op_AbsVF) { 1024 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 1025 } else { 1026 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 1027 vxorps(dst, src, 
ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 1028 } 1029 } 1030 1031 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 1032 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 1033 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 1034 1035 if (opcode == Op_MinV) { 1036 if (elem_bt == T_BYTE) { 1037 pminsb(dst, src); 1038 } else if (elem_bt == T_SHORT) { 1039 pminsw(dst, src); 1040 } else if (elem_bt == T_INT) { 1041 pminsd(dst, src); 1042 } else { 1043 assert(elem_bt == T_LONG, "required"); 1044 assert(tmp == xmm0, "required"); 1045 assert_different_registers(dst, src, tmp); 1046 movdqu(xmm0, dst); 1047 pcmpgtq(xmm0, src); 1048 blendvpd(dst, src); // xmm0 as mask 1049 } 1050 } else { // opcode == Op_MaxV 1051 if (elem_bt == T_BYTE) { 1052 pmaxsb(dst, src); 1053 } else if (elem_bt == T_SHORT) { 1054 pmaxsw(dst, src); 1055 } else if (elem_bt == T_INT) { 1056 pmaxsd(dst, src); 1057 } else { 1058 assert(elem_bt == T_LONG, "required"); 1059 assert(tmp == xmm0, "required"); 1060 assert_different_registers(dst, src, tmp); 1061 movdqu(xmm0, src); 1062 pcmpgtq(xmm0, dst); 1063 blendvpd(dst, src); // xmm0 as mask 1064 } 1065 } 1066 } 1067 1068 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 1069 XMMRegister dst, XMMRegister src1, XMMRegister src2, 1070 int vlen_enc) { 1071 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 1072 1073 if (opcode == Op_MinV) { 1074 if (elem_bt == T_BYTE) { 1075 vpminsb(dst, src1, src2, vlen_enc); 1076 } else if (elem_bt == T_SHORT) { 1077 vpminsw(dst, src1, src2, vlen_enc); 1078 } else if (elem_bt == T_INT) { 1079 vpminsd(dst, src1, src2, vlen_enc); 1080 } else { 1081 assert(elem_bt == T_LONG, "required"); 1082 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1083 vpminsq(dst, src1, src2, vlen_enc); 1084 } else { 1085 assert_different_registers(dst, src1, src2); 1086 vpcmpgtq(dst, src1, src2, vlen_enc); 1087 vblendvpd(dst, src1, src2, dst, vlen_enc); 1088 } 1089 } 1090 } else { // opcode == Op_MaxV 1091 if (elem_bt == T_BYTE) { 1092 vpmaxsb(dst, src1, src2, vlen_enc); 1093 } else if (elem_bt == T_SHORT) { 1094 vpmaxsw(dst, src1, src2, vlen_enc); 1095 } else if (elem_bt == T_INT) { 1096 vpmaxsd(dst, src1, src2, vlen_enc); 1097 } else { 1098 assert(elem_bt == T_LONG, "required"); 1099 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1100 vpmaxsq(dst, src1, src2, vlen_enc); 1101 } else { 1102 assert_different_registers(dst, src1, src2); 1103 vpcmpgtq(dst, src1, src2, vlen_enc); 1104 vblendvpd(dst, src2, src1, dst, vlen_enc); 1105 } 1106 } 1107 } 1108 } 1109 1110 // Float/Double min max 1111 1112 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, 1113 XMMRegister dst, XMMRegister a, XMMRegister b, 1114 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 1115 int vlen_enc) { 1116 assert(UseAVX > 0, "required"); 1117 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1118 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1119 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1120 assert_different_registers(a, b, tmp, atmp, btmp); 1121 1122 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1123 bool is_double_word = is_double_word_type(elem_bt); 1124 1125 if (!is_double_word && is_min) { 1126 vblendvps(atmp, a, b, a, vlen_enc); 1127 vblendvps(btmp, b, a, a, vlen_enc); 1128 vminps(tmp, atmp, btmp, 
vlen_enc); 1129 vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1130 vblendvps(dst, tmp, atmp, btmp, vlen_enc); 1131 } else if (!is_double_word && !is_min) { 1132 vblendvps(btmp, b, a, b, vlen_enc); 1133 vblendvps(atmp, a, b, b, vlen_enc); 1134 vmaxps(tmp, atmp, btmp, vlen_enc); 1135 vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1136 vblendvps(dst, tmp, atmp, btmp, vlen_enc); 1137 } else if (is_double_word && is_min) { 1138 vblendvpd(atmp, a, b, a, vlen_enc); 1139 vblendvpd(btmp, b, a, a, vlen_enc); 1140 vminpd(tmp, atmp, btmp, vlen_enc); 1141 vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1142 vblendvpd(dst, tmp, atmp, btmp, vlen_enc); 1143 } else { 1144 assert(is_double_word && !is_min, "sanity"); 1145 vblendvpd(btmp, b, a, b, vlen_enc); 1146 vblendvpd(atmp, a, b, b, vlen_enc); 1147 vmaxpd(tmp, atmp, btmp, vlen_enc); 1148 vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1149 vblendvpd(dst, tmp, atmp, btmp, vlen_enc); 1150 } 1151 } 1152 1153 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1154 XMMRegister dst, XMMRegister a, XMMRegister b, 1155 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1156 int vlen_enc) { 1157 assert(UseAVX > 2, "required"); 1158 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1159 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1160 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1161 assert_different_registers(dst, a, b, atmp, btmp); 1162 1163 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1164 bool is_double_word = is_double_word_type(elem_bt); 1165 bool merge = true; 1166 1167 if (!is_double_word && is_min) { 1168 evpmovd2m(ktmp, a, vlen_enc); 1169 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1170 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1171 vminps(dst, atmp, btmp, vlen_enc); 1172 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1173 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1174 } else if (!is_double_word && !is_min) { 1175 evpmovd2m(ktmp, b, vlen_enc); 1176 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1177 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1178 vmaxps(dst, atmp, btmp, vlen_enc); 1179 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1180 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1181 } else if (is_double_word && is_min) { 1182 evpmovq2m(ktmp, a, vlen_enc); 1183 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1184 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1185 vminpd(dst, atmp, btmp, vlen_enc); 1186 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1187 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1188 } else { 1189 assert(is_double_word && !is_min, "sanity"); 1190 evpmovq2m(ktmp, b, vlen_enc); 1191 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1192 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1193 vmaxpd(dst, atmp, btmp, vlen_enc); 1194 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1195 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1196 } 1197 } 1198 1199 // Float/Double signum 1200 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1201 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1202 1203 Label DONE_LABEL; 1204 1205 if (opcode == Op_SignumF) { 1206 assert(UseSSE > 0, "required"); 1207 ucomiss(dst, zero); 1208 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1209 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument 
NaN, return NaN 1210 movflt(dst, one); 1211 jcc(Assembler::above, DONE_LABEL); 1212 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1213 } else if (opcode == Op_SignumD) { 1214 assert(UseSSE > 1, "required"); 1215 ucomisd(dst, zero); 1216 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1217 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1218 movdbl(dst, one); 1219 jcc(Assembler::above, DONE_LABEL); 1220 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1221 } 1222 1223 bind(DONE_LABEL); 1224 } 1225 1226 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1227 if (sign) { 1228 pmovsxbw(dst, src); 1229 } else { 1230 pmovzxbw(dst, src); 1231 } 1232 } 1233 1234 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1235 if (sign) { 1236 vpmovsxbw(dst, src, vector_len); 1237 } else { 1238 vpmovzxbw(dst, src, vector_len); 1239 } 1240 } 1241 1242 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1243 if (sign) { 1244 vpmovsxbd(dst, src, vector_len); 1245 } else { 1246 vpmovzxbd(dst, src, vector_len); 1247 } 1248 } 1249 1250 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1251 if (sign) { 1252 vpmovsxwd(dst, src, vector_len); 1253 } else { 1254 vpmovzxwd(dst, src, vector_len); 1255 } 1256 } 1257 1258 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1259 int shift, int vector_len) { 1260 if (opcode == Op_RotateLeftV) { 1261 if (etype == T_INT) { 1262 evprold(dst, src, shift, vector_len); 1263 } else { 1264 assert(etype == T_LONG, "expected type T_LONG"); 1265 evprolq(dst, src, shift, vector_len); 1266 } 1267 } else { 1268 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1269 if (etype == T_INT) { 1270 evprord(dst, src, shift, vector_len); 1271 } else { 1272 assert(etype == T_LONG, "expected type T_LONG"); 1273 evprorq(dst, src, shift, vector_len); 1274 } 1275 } 1276 } 1277 1278 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1279 XMMRegister shift, int vector_len) { 1280 if (opcode == Op_RotateLeftV) { 1281 if (etype == T_INT) { 1282 evprolvd(dst, src, shift, vector_len); 1283 } else { 1284 assert(etype == T_LONG, "expected type T_LONG"); 1285 evprolvq(dst, src, shift, vector_len); 1286 } 1287 } else { 1288 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1289 if (etype == T_INT) { 1290 evprorvd(dst, src, shift, vector_len); 1291 } else { 1292 assert(etype == T_LONG, "expected type T_LONG"); 1293 evprorvq(dst, src, shift, vector_len); 1294 } 1295 } 1296 } 1297 1298 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1299 if (opcode == Op_RShiftVI) { 1300 psrad(dst, shift); 1301 } else if (opcode == Op_LShiftVI) { 1302 pslld(dst, shift); 1303 } else { 1304 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1305 psrld(dst, shift); 1306 } 1307 } 1308 1309 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1310 switch (opcode) { 1311 case Op_RShiftVI: psrad(dst, shift); break; 1312 case Op_LShiftVI: pslld(dst, shift); break; 1313 case Op_URShiftVI: psrld(dst, shift); break; 1314 1315 default: assert(false, "%s", NodeClassNames[opcode]); 1316 } 1317 } 
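
// Note on the shift helpers in this block (descriptive comment for orientation):
// the two-operand forms emit the destructive SSE encodings, the overloads taking
// an extra 'nds'/'src' register emit the non-destructive AVX three-operand
// encodings, and the '_imm' variants take the shift count as an immediate rather
// than in an XMM register. All of them dispatch on the C2 vector-shift opcode.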
1318 1319 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1320 if (opcode == Op_RShiftVI) { 1321 vpsrad(dst, nds, shift, vector_len); 1322 } else if (opcode == Op_LShiftVI) { 1323 vpslld(dst, nds, shift, vector_len); 1324 } else { 1325 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1326 vpsrld(dst, nds, shift, vector_len); 1327 } 1328 } 1329 1330 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1331 switch (opcode) { 1332 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1333 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1334 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1335 1336 default: assert(false, "%s", NodeClassNames[opcode]); 1337 } 1338 } 1339 1340 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1341 switch (opcode) { 1342 case Op_RShiftVB: // fall-through 1343 case Op_RShiftVS: psraw(dst, shift); break; 1344 1345 case Op_LShiftVB: // fall-through 1346 case Op_LShiftVS: psllw(dst, shift); break; 1347 1348 case Op_URShiftVS: // fall-through 1349 case Op_URShiftVB: psrlw(dst, shift); break; 1350 1351 default: assert(false, "%s", NodeClassNames[opcode]); 1352 } 1353 } 1354 1355 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1356 switch (opcode) { 1357 case Op_RShiftVB: // fall-through 1358 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1359 1360 case Op_LShiftVB: // fall-through 1361 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1362 1363 case Op_URShiftVS: // fall-through 1364 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1365 1366 default: assert(false, "%s", NodeClassNames[opcode]); 1367 } 1368 } 1369 1370 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1371 switch (opcode) { 1372 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1373 case Op_LShiftVL: psllq(dst, shift); break; 1374 case Op_URShiftVL: psrlq(dst, shift); break; 1375 1376 default: assert(false, "%s", NodeClassNames[opcode]); 1377 } 1378 } 1379 1380 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1381 if (opcode == Op_RShiftVL) { 1382 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1383 } else if (opcode == Op_LShiftVL) { 1384 psllq(dst, shift); 1385 } else { 1386 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1387 psrlq(dst, shift); 1388 } 1389 } 1390 1391 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1392 switch (opcode) { 1393 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1394 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1395 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1396 1397 default: assert(false, "%s", NodeClassNames[opcode]); 1398 } 1399 } 1400 1401 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1402 if (opcode == Op_RShiftVL) { 1403 evpsraq(dst, nds, shift, vector_len); 1404 } else if (opcode == Op_LShiftVL) { 1405 vpsllq(dst, nds, shift, vector_len); 1406 } else { 1407 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1408 vpsrlq(dst, nds, shift, vector_len); 1409 } 1410 } 1411 1412 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, 
XMMRegister src, XMMRegister shift, int vlen_enc) { 1413 switch (opcode) { 1414 case Op_RShiftVB: // fall-through 1415 case Op_RShiftVS: // fall-through 1416 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1417 1418 case Op_LShiftVB: // fall-through 1419 case Op_LShiftVS: // fall-through 1420 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1421 1422 case Op_URShiftVB: // fall-through 1423 case Op_URShiftVS: // fall-through 1424 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1425 1426 default: assert(false, "%s", NodeClassNames[opcode]); 1427 } 1428 } 1429 1430 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1431 switch (opcode) { 1432 case Op_RShiftVB: // fall-through 1433 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1434 1435 case Op_LShiftVB: // fall-through 1436 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1437 1438 case Op_URShiftVB: // fall-through 1439 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1440 1441 default: assert(false, "%s", NodeClassNames[opcode]); 1442 } 1443 } 1444 1445 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1446 assert(UseAVX >= 2, "required"); 1447 switch (opcode) { 1448 case Op_RShiftVL: { 1449 if (UseAVX > 2) { 1450 assert(tmp == xnoreg, "not used"); 1451 if (!VM_Version::supports_avx512vl()) { 1452 vlen_enc = Assembler::AVX_512bit; 1453 } 1454 evpsravq(dst, src, shift, vlen_enc); 1455 } else { 1456 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1457 vpsrlvq(dst, src, shift, vlen_enc); 1458 vpsrlvq(tmp, tmp, shift, vlen_enc); 1459 vpxor(dst, dst, tmp, vlen_enc); 1460 vpsubq(dst, dst, tmp, vlen_enc); 1461 } 1462 break; 1463 } 1464 case Op_LShiftVL: { 1465 assert(tmp == xnoreg, "not used"); 1466 vpsllvq(dst, src, shift, vlen_enc); 1467 break; 1468 } 1469 case Op_URShiftVL: { 1470 assert(tmp == xnoreg, "not used"); 1471 vpsrlvq(dst, src, shift, vlen_enc); 1472 break; 1473 } 1474 default: assert(false, "%s", NodeClassNames[opcode]); 1475 } 1476 } 1477 1478 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1479 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1480 assert(opcode == Op_LShiftVB || 1481 opcode == Op_RShiftVB || 1482 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1483 bool sign = (opcode != Op_URShiftVB); 1484 assert(vector_len == 0, "required"); 1485 vextendbd(sign, dst, src, 1); 1486 vpmovzxbd(vtmp, shift, 1); 1487 varshiftd(opcode, dst, dst, vtmp, 1); 1488 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1489 vextracti128_high(vtmp, dst); 1490 vpackusdw(dst, dst, vtmp, 0); 1491 } 1492 1493 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1494 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1495 assert(opcode == Op_LShiftVB || 1496 opcode == Op_RShiftVB || 1497 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1498 bool sign = (opcode != Op_URShiftVB); 1499 int ext_vector_len = vector_len + 1; 1500 vextendbw(sign, dst, src, ext_vector_len); 1501 vpmovzxbw(vtmp, shift, ext_vector_len); 1502 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1503 vpand(dst, dst, 
ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1504 if (vector_len == 0) { 1505 vextracti128_high(vtmp, dst); 1506 vpackuswb(dst, dst, vtmp, vector_len); 1507 } else { 1508 vextracti64x4_high(vtmp, dst); 1509 vpackuswb(dst, dst, vtmp, vector_len); 1510 vpermq(dst, dst, 0xD8, vector_len); 1511 } 1512 } 1513 1514 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1515 switch(typ) { 1516 case T_BYTE: 1517 pinsrb(dst, val, idx); 1518 break; 1519 case T_SHORT: 1520 pinsrw(dst, val, idx); 1521 break; 1522 case T_INT: 1523 pinsrd(dst, val, idx); 1524 break; 1525 case T_LONG: 1526 pinsrq(dst, val, idx); 1527 break; 1528 default: 1529 assert(false,"Should not reach here."); 1530 break; 1531 } 1532 } 1533 1534 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1535 switch(typ) { 1536 case T_BYTE: 1537 vpinsrb(dst, src, val, idx); 1538 break; 1539 case T_SHORT: 1540 vpinsrw(dst, src, val, idx); 1541 break; 1542 case T_INT: 1543 vpinsrd(dst, src, val, idx); 1544 break; 1545 case T_LONG: 1546 vpinsrq(dst, src, val, idx); 1547 break; 1548 default: 1549 assert(false,"Should not reach here."); 1550 break; 1551 } 1552 } 1553 1554 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1555 switch(typ) { 1556 case T_INT: 1557 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1558 break; 1559 case T_FLOAT: 1560 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1561 break; 1562 case T_LONG: 1563 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1564 break; 1565 case T_DOUBLE: 1566 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1567 break; 1568 default: 1569 assert(false,"Should not reach here."); 1570 break; 1571 } 1572 } 1573 1574 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1575 switch(typ) { 1576 case T_INT: 1577 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1578 break; 1579 case T_FLOAT: 1580 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1581 break; 1582 case T_LONG: 1583 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1584 break; 1585 case T_DOUBLE: 1586 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1587 break; 1588 default: 1589 assert(false,"Should not reach here."); 1590 break; 1591 } 1592 } 1593 1594 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1595 switch(typ) { 1596 case T_INT: 1597 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1598 break; 1599 case T_FLOAT: 1600 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1601 break; 1602 case T_LONG: 1603 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1604 break; 1605 case T_DOUBLE: 1606 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1607 break; 1608 default: 1609 assert(false,"Should not reach here."); 1610 break; 1611 } 1612 } 1613 1614 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1615 if (vlen_in_bytes <= 16) { 1616 pxor (dst, dst); 1617 psubb(dst, src); 1618 switch (elem_bt) { 1619 
case T_BYTE: /* nothing to do */ break; 1620 case T_SHORT: pmovsxbw(dst, dst); break; 1621 case T_INT: pmovsxbd(dst, dst); break; 1622 case T_FLOAT: pmovsxbd(dst, dst); break; 1623 case T_LONG: pmovsxbq(dst, dst); break; 1624 case T_DOUBLE: pmovsxbq(dst, dst); break; 1625 1626 default: assert(false, "%s", type2name(elem_bt)); 1627 } 1628 } else { 1629 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1630 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1631 1632 vpxor (dst, dst, dst, vlen_enc); 1633 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc); 1634 1635 switch (elem_bt) { 1636 case T_BYTE: /* nothing to do */ break; 1637 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1638 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1639 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1640 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1641 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1642 1643 default: assert(false, "%s", type2name(elem_bt)); 1644 } 1645 } 1646 } 1647 1648 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1649 if (novlbwdq) { 1650 vpmovsxbd(xtmp, src, vlen_enc); 1651 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1652 Assembler::eq, true, vlen_enc, noreg); 1653 } else { 1654 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1655 vpsubb(xtmp, xtmp, src, vlen_enc); 1656 evpmovb2m(dst, xtmp, vlen_enc); 1657 } 1658 } 1659 1660 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1661 switch (vlen_in_bytes) { 1662 case 4: movdl(dst, src); break; 1663 case 8: movq(dst, src); break; 1664 case 16: movdqu(dst, src); break; 1665 case 32: vmovdqu(dst, src); break; 1666 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1667 default: ShouldNotReachHere(); 1668 } 1669 } 1670 1671 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1672 assert(rscratch != noreg || always_reachable(src), "missing"); 1673 1674 if (reachable(src)) { 1675 load_vector(dst, as_Address(src), vlen_in_bytes); 1676 } else { 1677 lea(rscratch, src); 1678 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1679 } 1680 } 1681 1682 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1683 int vlen_enc = vector_length_encoding(vlen); 1684 if (VM_Version::supports_avx()) { 1685 if (bt == T_LONG) { 1686 if (VM_Version::supports_avx2()) { 1687 vpbroadcastq(dst, src, vlen_enc); 1688 } else { 1689 vmovddup(dst, src, vlen_enc); 1690 } 1691 } else if (bt == T_DOUBLE) { 1692 if (vlen_enc != Assembler::AVX_128bit) { 1693 vbroadcastsd(dst, src, vlen_enc, noreg); 1694 } else { 1695 vmovddup(dst, src, vlen_enc); 1696 } 1697 } else { 1698 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1699 vpbroadcastd(dst, src, vlen_enc); 1700 } else { 1701 vbroadcastss(dst, src, vlen_enc); 1702 } 1703 } 1704 } else if (VM_Version::supports_sse3()) { 1705 movddup(dst, src); 1706 } else { 1707 movq(dst, src); 1708 if (vlen == 16) { 1709 punpcklqdq(dst, dst); 1710 } 1711 } 1712 } 1713 1714 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1715 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 
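// For instance, with that layout the offsets computed below work out to:
//   T_BYTE -> 0, T_SHORT -> 64, T_INT -> 128, T_LONG -> 192,
//   and the floating-point tables follow at +128: T_FLOAT -> 256, T_DOUBLE -> 320.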
1716 int offset = exact_log2(type2aelembytes(bt)) << 6; 1717 if (is_floating_point_type(bt)) { 1718 offset += 128; 1719 } 1720 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1721 load_vector(dst, addr, vlen_in_bytes); 1722 } 1723 1724 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 1725 1726 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1727 int vector_len = Assembler::AVX_128bit; 1728 1729 switch (opcode) { 1730 case Op_AndReductionV: pand(dst, src); break; 1731 case Op_OrReductionV: por (dst, src); break; 1732 case Op_XorReductionV: pxor(dst, src); break; 1733 case Op_MinReductionV: 1734 switch (typ) { 1735 case T_BYTE: pminsb(dst, src); break; 1736 case T_SHORT: pminsw(dst, src); break; 1737 case T_INT: pminsd(dst, src); break; 1738 case T_LONG: assert(UseAVX > 2, "required"); 1739 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1740 default: assert(false, "wrong type"); 1741 } 1742 break; 1743 case Op_MaxReductionV: 1744 switch (typ) { 1745 case T_BYTE: pmaxsb(dst, src); break; 1746 case T_SHORT: pmaxsw(dst, src); break; 1747 case T_INT: pmaxsd(dst, src); break; 1748 case T_LONG: assert(UseAVX > 2, "required"); 1749 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1750 default: assert(false, "wrong type"); 1751 } 1752 break; 1753 case Op_AddReductionVF: addss(dst, src); break; 1754 case Op_AddReductionVD: addsd(dst, src); break; 1755 case Op_AddReductionVI: 1756 switch (typ) { 1757 case T_BYTE: paddb(dst, src); break; 1758 case T_SHORT: paddw(dst, src); break; 1759 case T_INT: paddd(dst, src); break; 1760 default: assert(false, "wrong type"); 1761 } 1762 break; 1763 case Op_AddReductionVL: paddq(dst, src); break; 1764 case Op_MulReductionVF: mulss(dst, src); break; 1765 case Op_MulReductionVD: mulsd(dst, src); break; 1766 case Op_MulReductionVI: 1767 switch (typ) { 1768 case T_SHORT: pmullw(dst, src); break; 1769 case T_INT: pmulld(dst, src); break; 1770 default: assert(false, "wrong type"); 1771 } 1772 break; 1773 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1774 evpmullq(dst, dst, src, vector_len); break; 1775 default: assert(false, "wrong opcode"); 1776 } 1777 } 1778 1779 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1780 int vector_len = Assembler::AVX_256bit; 1781 1782 switch (opcode) { 1783 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1784 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1785 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1786 case Op_MinReductionV: 1787 switch (typ) { 1788 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1789 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1790 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1791 case T_LONG: assert(UseAVX > 2, "required"); 1792 vpminsq(dst, src1, src2, vector_len); break; 1793 default: assert(false, "wrong type"); 1794 } 1795 break; 1796 case Op_MaxReductionV: 1797 switch (typ) { 1798 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1799 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1800 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1801 case T_LONG: assert(UseAVX > 2, "required"); 1802 vpmaxsq(dst, src1, src2, vector_len); break; 1803 default: assert(false, "wrong type"); 1804 } 1805 break; 1806 case Op_AddReductionVI: 1807 switch (typ) { 1808 case T_BYTE: vpaddb(dst, src1, 
src2, vector_len); break; 1809 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1810 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1811 default: assert(false, "wrong type"); 1812 } 1813 break; 1814 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1815 case Op_MulReductionVI: 1816 switch (typ) { 1817 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1818 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1819 default: assert(false, "wrong type"); 1820 } 1821 break; 1822 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1823 default: assert(false, "wrong opcode"); 1824 } 1825 } 1826 1827 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1828 XMMRegister dst, XMMRegister src, 1829 XMMRegister vtmp1, XMMRegister vtmp2) { 1830 switch (opcode) { 1831 case Op_AddReductionVF: 1832 case Op_MulReductionVF: 1833 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1834 break; 1835 1836 case Op_AddReductionVD: 1837 case Op_MulReductionVD: 1838 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1839 break; 1840 1841 default: assert(false, "wrong opcode"); 1842 } 1843 } 1844 1845 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1846 Register dst, Register src1, XMMRegister src2, 1847 XMMRegister vtmp1, XMMRegister vtmp2) { 1848 switch (vlen) { 1849 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1850 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1851 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1852 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1853 1854 default: assert(false, "wrong vector length"); 1855 } 1856 } 1857 1858 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1859 Register dst, Register src1, XMMRegister src2, 1860 XMMRegister vtmp1, XMMRegister vtmp2) { 1861 switch (vlen) { 1862 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1863 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1864 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1865 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1866 1867 default: assert(false, "wrong vector length"); 1868 } 1869 } 1870 1871 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1872 Register dst, Register src1, XMMRegister src2, 1873 XMMRegister vtmp1, XMMRegister vtmp2) { 1874 switch (vlen) { 1875 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1876 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1877 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1878 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1879 1880 default: assert(false, "wrong vector length"); 1881 } 1882 } 1883 1884 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1885 Register dst, Register src1, XMMRegister src2, 1886 XMMRegister vtmp1, XMMRegister vtmp2) { 1887 switch (vlen) { 1888 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1889 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1890 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1891 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1892 1893 default: assert(false, "wrong vector length"); 1894 } 1895 } 1896 1897 #ifdef _LP64 1898 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1899 Register dst, Register src1, XMMRegister src2, 1900 XMMRegister vtmp1, XMMRegister vtmp2) { 1901 switch (vlen) { 1902 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, 
vtmp2); break; 1903 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1904 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1905 1906 default: assert(false, "wrong vector length"); 1907 } 1908 } 1909 #endif // _LP64 1910 1911 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1912 switch (vlen) { 1913 case 2: 1914 assert(vtmp2 == xnoreg, ""); 1915 reduce2F(opcode, dst, src, vtmp1); 1916 break; 1917 case 4: 1918 assert(vtmp2 == xnoreg, ""); 1919 reduce4F(opcode, dst, src, vtmp1); 1920 break; 1921 case 8: 1922 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1923 break; 1924 case 16: 1925 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1926 break; 1927 default: assert(false, "wrong vector length"); 1928 } 1929 } 1930 1931 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1932 switch (vlen) { 1933 case 2: 1934 assert(vtmp2 == xnoreg, ""); 1935 reduce2D(opcode, dst, src, vtmp1); 1936 break; 1937 case 4: 1938 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1939 break; 1940 case 8: 1941 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1942 break; 1943 default: assert(false, "wrong vector length"); 1944 } 1945 } 1946 1947 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1948 if (opcode == Op_AddReductionVI) { 1949 if (vtmp1 != src2) { 1950 movdqu(vtmp1, src2); 1951 } 1952 phaddd(vtmp1, vtmp1); 1953 } else { 1954 pshufd(vtmp1, src2, 0x1); 1955 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1956 } 1957 movdl(vtmp2, src1); 1958 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1959 movdl(dst, vtmp1); 1960 } 1961 1962 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1963 if (opcode == Op_AddReductionVI) { 1964 if (vtmp1 != src2) { 1965 movdqu(vtmp1, src2); 1966 } 1967 phaddd(vtmp1, src2); 1968 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1969 } else { 1970 pshufd(vtmp2, src2, 0xE); 1971 reduce_operation_128(T_INT, opcode, vtmp2, src2); 1972 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1973 } 1974 } 1975 1976 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1977 if (opcode == Op_AddReductionVI) { 1978 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 1979 vextracti128_high(vtmp2, vtmp1); 1980 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 1981 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1982 } else { 1983 vextracti128_high(vtmp1, src2); 1984 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1985 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1986 } 1987 } 1988 1989 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1990 vextracti64x4_high(vtmp2, src2); 1991 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 1992 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1993 } 1994 1995 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1996 pshufd(vtmp2, src2, 0x1); 1997 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1998 movdqu(vtmp1, vtmp2); 1999 psrldq(vtmp1, 2); 2000 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2001 movdqu(vtmp2, vtmp1); 2002 psrldq(vtmp2, 1); 2003 reduce_operation_128(T_BYTE, 
opcode, vtmp1, vtmp2); 2004 movdl(vtmp2, src1); 2005 pmovsxbd(vtmp1, vtmp1); 2006 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2007 pextrb(dst, vtmp1, 0x0); 2008 movsbl(dst, dst); 2009 } 2010 2011 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2012 pshufd(vtmp1, src2, 0xE); 2013 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2014 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2015 } 2016 2017 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2018 vextracti128_high(vtmp2, src2); 2019 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2020 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2021 } 2022 2023 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2024 vextracti64x4_high(vtmp1, src2); 2025 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2026 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2027 } 2028 2029 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2030 pmovsxbw(vtmp2, src2); 2031 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2032 } 2033 2034 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2035 if (UseAVX > 1) { 2036 int vector_len = Assembler::AVX_256bit; 2037 vpmovsxbw(vtmp1, src2, vector_len); 2038 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2039 } else { 2040 pmovsxbw(vtmp2, src2); 2041 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2042 pshufd(vtmp2, src2, 0x1); 2043 pmovsxbw(vtmp2, src2); 2044 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2045 } 2046 } 2047 2048 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2049 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2050 int vector_len = Assembler::AVX_512bit; 2051 vpmovsxbw(vtmp1, src2, vector_len); 2052 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2053 } else { 2054 assert(UseAVX >= 2,"Should not reach here."); 2055 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2056 vextracti128_high(vtmp2, src2); 2057 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2058 } 2059 } 2060 2061 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2062 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2063 vextracti64x4_high(vtmp2, src2); 2064 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2065 } 2066 2067 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2068 if (opcode == Op_AddReductionVI) { 2069 if (vtmp1 != src2) { 2070 movdqu(vtmp1, src2); 2071 } 2072 phaddw(vtmp1, vtmp1); 2073 phaddw(vtmp1, vtmp1); 2074 } else { 2075 pshufd(vtmp2, src2, 0x1); 2076 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2077 movdqu(vtmp1, vtmp2); 2078 psrldq(vtmp1, 2); 2079 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2080 } 2081 movdl(vtmp2, src1); 2082 pmovsxwd(vtmp1, vtmp1); 2083 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2084 pextrw(dst, vtmp1, 0x0); 2085 movswl(dst, dst); 2086 } 2087 2088 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, 
XMMRegister vtmp2) { 2089 if (opcode == Op_AddReductionVI) { 2090 if (vtmp1 != src2) { 2091 movdqu(vtmp1, src2); 2092 } 2093 phaddw(vtmp1, src2); 2094 } else { 2095 pshufd(vtmp1, src2, 0xE); 2096 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2097 } 2098 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2099 } 2100 2101 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2102 if (opcode == Op_AddReductionVI) { 2103 int vector_len = Assembler::AVX_256bit; 2104 vphaddw(vtmp2, src2, src2, vector_len); 2105 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2106 } else { 2107 vextracti128_high(vtmp2, src2); 2108 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2109 } 2110 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2111 } 2112 2113 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2114 int vector_len = Assembler::AVX_256bit; 2115 vextracti64x4_high(vtmp1, src2); 2116 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2117 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2118 } 2119 2120 #ifdef _LP64 2121 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2122 pshufd(vtmp2, src2, 0xE); 2123 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2124 movdq(vtmp1, src1); 2125 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2126 movdq(dst, vtmp1); 2127 } 2128 2129 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2130 vextracti128_high(vtmp1, src2); 2131 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2132 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2133 } 2134 2135 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2136 vextracti64x4_high(vtmp2, src2); 2137 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2138 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2139 } 2140 2141 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2142 mov64(temp, -1L); 2143 bzhiq(temp, temp, len); 2144 kmovql(dst, temp); 2145 } 2146 #endif // _LP64 2147 2148 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2149 reduce_operation_128(T_FLOAT, opcode, dst, src); 2150 pshufd(vtmp, src, 0x1); 2151 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2152 } 2153 2154 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2155 reduce2F(opcode, dst, src, vtmp); 2156 pshufd(vtmp, src, 0x2); 2157 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2158 pshufd(vtmp, src, 0x3); 2159 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2160 } 2161 2162 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2163 reduce4F(opcode, dst, src, vtmp2); 2164 vextractf128_high(vtmp2, src); 2165 reduce4F(opcode, dst, vtmp2, vtmp1); 2166 } 2167 2168 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2169 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2170 vextracti64x4_high(vtmp1, src); 2171 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2172 } 2173 2174 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2175 
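  // Fold the two double lanes of src into dst: combine dst with the low lane first,
  // then pshufd(0xE) brings the high lane down so it can be combined as well.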
reduce_operation_128(T_DOUBLE, opcode, dst, src); 2176 pshufd(vtmp, src, 0xE); 2177 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2178 } 2179 2180 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2181 reduce2D(opcode, dst, src, vtmp2); 2182 vextractf128_high(vtmp2, src); 2183 reduce2D(opcode, dst, vtmp2, vtmp1); 2184 } 2185 2186 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2187 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2188 vextracti64x4_high(vtmp1, src); 2189 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2190 } 2191 2192 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2193 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2194 } 2195 2196 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2197 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2198 } 2199 2200 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2201 int vec_enc) { 2202 switch(elem_bt) { 2203 case T_INT: 2204 case T_FLOAT: 2205 vmaskmovps(dst, src, mask, vec_enc); 2206 break; 2207 case T_LONG: 2208 case T_DOUBLE: 2209 vmaskmovpd(dst, src, mask, vec_enc); 2210 break; 2211 default: 2212 fatal("Unsupported type %s", type2name(elem_bt)); 2213 break; 2214 } 2215 } 2216 2217 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2218 int vec_enc) { 2219 switch(elem_bt) { 2220 case T_INT: 2221 case T_FLOAT: 2222 vmaskmovps(dst, src, mask, vec_enc); 2223 break; 2224 case T_LONG: 2225 case T_DOUBLE: 2226 vmaskmovpd(dst, src, mask, vec_enc); 2227 break; 2228 default: 2229 fatal("Unsupported type %s", type2name(elem_bt)); 2230 break; 2231 } 2232 } 2233 2234 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2235 XMMRegister dst, XMMRegister src, 2236 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2237 XMMRegister xmm_0, XMMRegister xmm_1) { 2238 const int permconst[] = {1, 14}; 2239 XMMRegister wsrc = src; 2240 XMMRegister wdst = xmm_0; 2241 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2242 2243 int vlen_enc = Assembler::AVX_128bit; 2244 if (vlen == 16) { 2245 vlen_enc = Assembler::AVX_256bit; 2246 } 2247 2248 for (int i = log2(vlen) - 1; i >=0; i--) { 2249 if (i == 0 && !is_dst_valid) { 2250 wdst = dst; 2251 } 2252 if (i == 3) { 2253 vextracti64x4_high(wtmp, wsrc); 2254 } else if (i == 2) { 2255 vextracti128_high(wtmp, wsrc); 2256 } else { // i = [0,1] 2257 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2258 } 2259 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2260 wsrc = wdst; 2261 vlen_enc = Assembler::AVX_128bit; 2262 } 2263 if (is_dst_valid) { 2264 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2265 } 2266 } 2267 2268 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2269 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2270 XMMRegister xmm_0, XMMRegister xmm_1) { 2271 XMMRegister wsrc = src; 2272 XMMRegister wdst = xmm_0; 2273 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2274 int vlen_enc = Assembler::AVX_128bit; 2275 if (vlen == 8) { 2276 vlen_enc = Assembler::AVX_256bit; 2277 } 2278 for (int i = log2(vlen) - 1; i >=0; i--) { 2279 if (i == 0 && !is_dst_valid) { 2280 wdst = dst; 2281 } 2282 if (i == 1) { 2283 vextracti128_high(wtmp, wsrc); 2284 } else if (i == 2) { 2285 vextracti64x4_high(wtmp, wsrc); 2286 } else { 2287 assert(i == 0, "%d", i); 2288 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2289 } 2290 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2291 wsrc = wdst; 2292 vlen_enc = Assembler::AVX_128bit; 2293 } 2294 if (is_dst_valid) { 2295 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2296 } 2297 } 2298 2299 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2300 switch (bt) { 2301 case T_BYTE: pextrb(dst, src, idx); break; 2302 case T_SHORT: pextrw(dst, src, idx); break; 2303 case T_INT: pextrd(dst, src, idx); break; 2304 case T_LONG: pextrq(dst, src, idx); break; 2305 2306 default: 2307 assert(false,"Should not reach here."); 2308 break; 2309 } 2310 } 2311 2312 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2313 int esize = type2aelembytes(typ); 2314 int elem_per_lane = 16/esize; 2315 int lane = elemindex / elem_per_lane; 2316 int eindex = elemindex % elem_per_lane; 2317 2318 if (lane >= 2) { 2319 assert(UseAVX > 2, "required"); 2320 vextractf32x4(dst, src, lane & 3); 2321 return dst; 2322 } else if (lane > 0) { 2323 assert(UseAVX > 0, "required"); 2324 vextractf128(dst, src, lane); 2325 return dst; 2326 } else { 2327 return src; 2328 } 2329 } 2330 2331 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2332 if (typ == T_BYTE) { 2333 movsbl(dst, dst); 2334 } else if (typ == T_SHORT) { 2335 movswl(dst, dst); 2336 } 2337 } 2338 2339 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2340 int esize = type2aelembytes(typ); 2341 int elem_per_lane = 16/esize; 2342 int eindex = elemindex % elem_per_lane; 2343 assert(is_integral_type(typ),"required"); 2344 2345 if (eindex == 0) { 2346 if (typ == T_LONG) { 2347 movq(dst, src); 2348 } else { 2349 movdl(dst, src); 2350 movsxl(typ, dst); 2351 } 2352 } else { 2353 extract(typ, dst, src, eindex); 2354 movsxl(typ, dst); 2355 } 2356 } 2357 2358 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2359 int esize = type2aelembytes(typ); 2360 int elem_per_lane = 16/esize; 2361 int eindex = elemindex % elem_per_lane; 2362 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2363 2364 if (eindex == 0) { 2365 movq(dst, src); 2366 } else { 2367 if (typ == T_FLOAT) { 2368 if (UseAVX == 0) { 2369 movdqu(dst, src); 2370 shufps(dst, dst, eindex); 2371 } else { 2372 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2373 } 2374 } else { 2375 if (UseAVX == 0) { 2376 movdqu(dst, src); 2377 psrldq(dst, eindex*esize); 2378 } else { 2379 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2380 } 2381 movq(dst, dst); 2382 } 2383 } 2384 // Zero upper bits 2385 if (typ == T_FLOAT) { 2386 if (UseAVX == 0) { 2387 assert(vtmp != xnoreg, "required."); 2388 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2389 pand(dst, vtmp); 2390 } else { 2391 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2392 } 2393 } 2394 } 2395 2396 void 
C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2397 switch(typ) { 2398 case T_BYTE: 2399 case T_BOOLEAN: 2400 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2401 break; 2402 case T_SHORT: 2403 case T_CHAR: 2404 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2405 break; 2406 case T_INT: 2407 case T_FLOAT: 2408 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2409 break; 2410 case T_LONG: 2411 case T_DOUBLE: 2412 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2413 break; 2414 default: 2415 assert(false,"Should not reach here."); 2416 break; 2417 } 2418 } 2419 2420 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2421 assert(rscratch != noreg || always_reachable(src2), "missing"); 2422 2423 switch(typ) { 2424 case T_BOOLEAN: 2425 case T_BYTE: 2426 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2427 break; 2428 case T_CHAR: 2429 case T_SHORT: 2430 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2431 break; 2432 case T_INT: 2433 case T_FLOAT: 2434 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2435 break; 2436 case T_LONG: 2437 case T_DOUBLE: 2438 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2439 break; 2440 default: 2441 assert(false,"Should not reach here."); 2442 break; 2443 } 2444 } 2445 2446 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2447 switch(typ) { 2448 case T_BYTE: 2449 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2450 break; 2451 case T_SHORT: 2452 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2453 break; 2454 case T_INT: 2455 case T_FLOAT: 2456 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2457 break; 2458 case T_LONG: 2459 case T_DOUBLE: 2460 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2461 break; 2462 default: 2463 assert(false,"Should not reach here."); 2464 break; 2465 } 2466 } 2467 2468 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2469 assert(vlen_in_bytes <= 32, ""); 2470 int esize = type2aelembytes(bt); 2471 if (vlen_in_bytes == 32) { 2472 assert(vtmp == xnoreg, "required."); 2473 if (esize >= 4) { 2474 vtestps(src1, src2, AVX_256bit); 2475 } else { 2476 vptest(src1, src2, AVX_256bit); 2477 } 2478 return; 2479 } 2480 if (vlen_in_bytes < 16) { 2481 // Duplicate the lower part to fill the whole register, 2482 // Don't need to do so for src2 2483 assert(vtmp != xnoreg, "required"); 2484 int shuffle_imm = (vlen_in_bytes == 4) ? 
0x00 : 0x04; 2485 pshufd(vtmp, src1, shuffle_imm); 2486 } else { 2487 assert(vtmp == xnoreg, "required"); 2488 vtmp = src1; 2489 } 2490 if (esize >= 4 && VM_Version::supports_avx()) { 2491 vtestps(vtmp, src2, AVX_128bit); 2492 } else { 2493 ptest(vtmp, src2); 2494 } 2495 } 2496 2497 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2498 assert(UseAVX >= 2, "required"); 2499 #ifdef ASSERT 2500 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2501 bool is_bw_supported = VM_Version::supports_avx512bw(); 2502 if (is_bw && !is_bw_supported) { 2503 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2504 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2505 "XMM register should be 0-15"); 2506 } 2507 #endif // ASSERT 2508 switch (elem_bt) { 2509 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2510 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2511 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2512 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2513 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2514 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2515 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2516 } 2517 } 2518 2519 #ifdef _LP64 2520 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2521 assert(UseAVX >= 2, "required"); 2522 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2523 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2524 if ((UseAVX > 2) && 2525 (!is_bw || VM_Version::supports_avx512bw()) && 2526 (!is_vl || VM_Version::supports_avx512vl())) { 2527 switch (elem_bt) { 2528 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2529 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2530 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2531 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2532 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2533 } 2534 } else { 2535 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2536 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2537 switch (elem_bt) { 2538 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2539 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2540 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2541 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2542 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2543 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2544 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2545 } 2546 } 2547 } 2548 #endif 2549 2550 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2551 switch (to_elem_bt) { 2552 case T_SHORT: 2553 vpmovsxbw(dst, src, vlen_enc); 2554 break; 2555 case T_INT: 2556 vpmovsxbd(dst, src, vlen_enc); 2557 break; 2558 case T_FLOAT: 2559 vpmovsxbd(dst, src, vlen_enc); 2560 vcvtdq2ps(dst, dst, vlen_enc); 2561 break; 2562 case T_LONG: 2563 vpmovsxbq(dst, src, vlen_enc); 2564 break; 2565 case T_DOUBLE: { 2566 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? 
Assembler::AVX_256bit : Assembler::AVX_128bit; 2567 vpmovsxbd(dst, src, mid_vlen_enc); 2568 vcvtdq2pd(dst, dst, vlen_enc); 2569 break; 2570 } 2571 default: 2572 fatal("Unsupported type %s", type2name(to_elem_bt)); 2573 break; 2574 } 2575 } 2576 2577 //------------------------------------------------------------------------------------------- 2578 2579 // IndexOf for constant substrings with size >= 8 chars 2580 // which don't need to be loaded through stack. 2581 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2582 Register cnt1, Register cnt2, 2583 int int_cnt2, Register result, 2584 XMMRegister vec, Register tmp, 2585 int ae) { 2586 ShortBranchVerifier sbv(this); 2587 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2588 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2589 2590 // This method uses the pcmpestri instruction with bound registers 2591 // inputs: 2592 // xmm - substring 2593 // rax - substring length (elements count) 2594 // mem - scanned string 2595 // rdx - string length (elements count) 2596 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2597 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2598 // outputs: 2599 // rcx - matched index in string 2600 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2601 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2602 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2603 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2604 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2605 2606 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2607 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2608 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2609 2610 // Note, inline_string_indexOf() generates checks: 2611 // if (substr.count > string.count) return -1; 2612 // if (substr.count == 0) return 0; 2613 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2614 2615 // Load substring. 2616 if (ae == StrIntrinsicNode::UL) { 2617 pmovzxbw(vec, Address(str2, 0)); 2618 } else { 2619 movdqu(vec, Address(str2, 0)); 2620 } 2621 movl(cnt2, int_cnt2); 2622 movptr(result, str1); // string addr 2623 2624 if (int_cnt2 > stride) { 2625 jmpb(SCAN_TO_SUBSTR); 2626 2627 // Reload substr for rescan, this code 2628 // is executed only for large substrings (> 8 chars) 2629 bind(RELOAD_SUBSTR); 2630 if (ae == StrIntrinsicNode::UL) { 2631 pmovzxbw(vec, Address(str2, 0)); 2632 } else { 2633 movdqu(vec, Address(str2, 0)); 2634 } 2635 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2636 2637 bind(RELOAD_STR); 2638 // We came here after the beginning of the substring was 2639 // matched but the rest of it was not, so we need to search 2640 // again. Start from the next element after the previous match. 2641 2642 // cnt2 is number of substring remaining elements and 2643 // cnt1 is number of string remaining elements when cmp failed.
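    // For example, with int_cnt2 == 10 and cnt2 == 7 elements still unmatched,
    // 3 elements had already been matched, so those 3 are added back to cnt1,
    // which is what the restore below computes.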
2644 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2645 subl(cnt1, cnt2); 2646 addl(cnt1, int_cnt2); 2647 movl(cnt2, int_cnt2); // Now restore cnt2 2648 2649 decrementl(cnt1); // Shift to next element 2650 cmpl(cnt1, cnt2); 2651 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2652 2653 addptr(result, (1<<scale1)); 2654 2655 } // (int_cnt2 > 8) 2656 2657 // Scan string for start of substr in 16-byte vectors 2658 bind(SCAN_TO_SUBSTR); 2659 pcmpestri(vec, Address(result, 0), mode); 2660 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2661 subl(cnt1, stride); 2662 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2663 cmpl(cnt1, cnt2); 2664 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2665 addptr(result, 16); 2666 jmpb(SCAN_TO_SUBSTR); 2667 2668 // Found a potential substr 2669 bind(FOUND_CANDIDATE); 2670 // Matched whole vector if first element matched (tmp(rcx) == 0). 2671 if (int_cnt2 == stride) { 2672 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2673 } else { // int_cnt2 > 8 2674 jccb(Assembler::overflow, FOUND_SUBSTR); 2675 } 2676 // After pcmpestri tmp(rcx) contains matched element index 2677 // Compute start addr of substr 2678 lea(result, Address(result, tmp, scale1)); 2679 2680 // Make sure string is still long enough 2681 subl(cnt1, tmp); 2682 cmpl(cnt1, cnt2); 2683 if (int_cnt2 == stride) { 2684 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2685 } else { // int_cnt2 > 8 2686 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2687 } 2688 // Left less than substring. 2689 2690 bind(RET_NOT_FOUND); 2691 movl(result, -1); 2692 jmp(EXIT); 2693 2694 if (int_cnt2 > stride) { 2695 // This code is optimized for the case when whole substring 2696 // is matched if its head is matched. 2697 bind(MATCH_SUBSTR_HEAD); 2698 pcmpestri(vec, Address(result, 0), mode); 2699 // Reload only the string if it does not match 2700 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2701 2702 Label CONT_SCAN_SUBSTR; 2703 // Compare the rest of substring (> 8 chars). 2704 bind(FOUND_SUBSTR); 2705 // First 8 chars are already matched.
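      // The code below negates cnt2 and then steps it up by 'stride' per
      // SCAN_SUBSTR iteration, using it as an index relative to the end of the
      // substring until it becomes non-negative.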
2706 negptr(cnt2); 2707 addptr(cnt2, stride); 2708 2709 bind(SCAN_SUBSTR); 2710 subl(cnt1, stride); 2711 cmpl(cnt2, -stride); // Do not read beyond substring 2712 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2713 // Back-up strings to avoid reading beyond substring: 2714 // cnt1 = cnt1 - cnt2 + 8 2715 addl(cnt1, cnt2); // cnt2 is negative 2716 addl(cnt1, stride); 2717 movl(cnt2, stride); negptr(cnt2); 2718 bind(CONT_SCAN_SUBSTR); 2719 if (int_cnt2 < (int)G) { 2720 int tail_off1 = int_cnt2<<scale1; 2721 int tail_off2 = int_cnt2<<scale2; 2722 if (ae == StrIntrinsicNode::UL) { 2723 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2724 } else { 2725 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2726 } 2727 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2728 } else { 2729 // calculate index in register to avoid integer overflow (int_cnt2*2) 2730 movl(tmp, int_cnt2); 2731 addptr(tmp, cnt2); 2732 if (ae == StrIntrinsicNode::UL) { 2733 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2734 } else { 2735 movdqu(vec, Address(str2, tmp, scale2, 0)); 2736 } 2737 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2738 } 2739 // Need to reload strings pointers if not matched whole vector 2740 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2741 addptr(cnt2, stride); 2742 jcc(Assembler::negative, SCAN_SUBSTR); 2743 // Fall through if found full substring 2744 2745 } // (int_cnt2 > 8) 2746 2747 bind(RET_FOUND); 2748 // Found result if we matched full small substring. 2749 // Compute substr offset 2750 subptr(result, str1); 2751 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2752 shrl(result, 1); // index 2753 } 2754 bind(EXIT); 2755 2756 } // string_indexofC8 2757 2758 // Small strings are loaded through stack if they cross page boundary. 2759 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2760 Register cnt1, Register cnt2, 2761 int int_cnt2, Register result, 2762 XMMRegister vec, Register tmp, 2763 int ae) { 2764 ShortBranchVerifier sbv(this); 2765 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2766 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2767 2768 // 2769 // int_cnt2 is length of small (< 8 chars) constant substring 2770 // or (-1) for non constant substring in which case its length 2771 // is in cnt2 register. 2772 // 2773 // Note, inline_string_indexOf() generates checks: 2774 // if (substr.count > string.count) return -1; 2775 // if (substr.count == 0) return 0; 2776 // 2777 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2778 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2779 // This method uses the pcmpestri instruction with bound registers 2780 // inputs: 2781 // xmm - substring 2782 // rax - substring length (elements count) 2783 // mem - scanned string 2784 // rdx - string length (elements count) 2785 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2786 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2787 // outputs: 2788 // rcx - matched index in string 2789 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2790 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2791 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2792 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2793 2794 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2795 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2796 FOUND_CANDIDATE; 2797 2798 { //======================================================== 2799 // We don't know where these strings are located 2800 // and we can't read beyond them. Load them through stack. 2801 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2802 2803 movptr(tmp, rsp); // save old SP 2804 2805 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2806 if (int_cnt2 == (1>>scale2)) { // One byte 2807 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2808 load_unsigned_byte(result, Address(str2, 0)); 2809 movdl(vec, result); // move 32 bits 2810 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2811 // Not enough header space in 32-bit VM: 12+3 = 15. 2812 movl(result, Address(str2, -1)); 2813 shrl(result, 8); 2814 movdl(vec, result); // move 32 bits 2815 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2816 load_unsigned_short(result, Address(str2, 0)); 2817 movdl(vec, result); // move 32 bits 2818 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2819 movdl(vec, Address(str2, 0)); // move 32 bits 2820 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2821 movq(vec, Address(str2, 0)); // move 64 bits 2822 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7}) 2823 // Array header size is 12 bytes in 32-bit VM 2824 // + 6 bytes for 3 chars == 18 bytes, 2825 // enough space to load vec and shift. 2826 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2827 if (ae == StrIntrinsicNode::UL) { 2828 int tail_off = int_cnt2-8; 2829 pmovzxbw(vec, Address(str2, tail_off)); 2830 psrldq(vec, -2*tail_off); 2831 } 2832 else { 2833 int tail_off = int_cnt2*(1<<scale2); 2834 movdqu(vec, Address(str2, tail_off-16)); 2835 psrldq(vec, 16-tail_off); 2836 } 2837 } 2838 } else { // not constant substring 2839 cmpl(cnt2, stride); 2840 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2841 2842 // We can read beyond string if str+16 does not cross page boundary 2843 // since heaps are aligned and mapped by pages. 2844 assert(os::vm_page_size() < (int)G, "default page should be small"); 2845 movl(result, str2); // We need only low 32 bits 2846 andl(result, ((int)os::vm_page_size()-1)); 2847 cmpl(result, ((int)os::vm_page_size()-16)); 2848 jccb(Assembler::belowEqual, CHECK_STR); 2849 2850 // Move small strings to stack to allow loading 16 bytes into vec. 2851 subptr(rsp, 16); 2852 int stk_offset = wordSize-(1<<scale2); 2853 push(cnt2); 2854 2855 bind(COPY_SUBSTR); 2856 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2857 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2858 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2859 } else if (ae == StrIntrinsicNode::UU) { 2860 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2861 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2862 } 2863 decrement(cnt2); 2864 jccb(Assembler::notZero, COPY_SUBSTR); 2865 2866 pop(cnt2); 2867 movptr(str2, rsp); // New substring address 2868 } // non constant 2869 2870 bind(CHECK_STR); 2871 cmpl(cnt1, stride); 2872 jccb(Assembler::aboveEqual, BIG_STRINGS); 2873 2874 // Check cross page boundary.
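    // The check: if (address & (page_size - 1)) <= page_size - 16, a 16-byte load
    // starting at that address cannot cross into the next page, so the data can be
    // scanned in place; otherwise it is copied to the stack first.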
2875 movl(result, str1); // We need only low 32 bits 2876 andl(result, ((int)os::vm_page_size()-1)); 2877 cmpl(result, ((int)os::vm_page_size()-16)); 2878 jccb(Assembler::belowEqual, BIG_STRINGS); 2879 2880 subptr(rsp, 16); 2881 int stk_offset = -(1<<scale1); 2882 if (int_cnt2 < 0) { // not constant 2883 push(cnt2); 2884 stk_offset += wordSize; 2885 } 2886 movl(cnt2, cnt1); 2887 2888 bind(COPY_STR); 2889 if (ae == StrIntrinsicNode::LL) { 2890 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2891 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2892 } else { 2893 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2894 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2895 } 2896 decrement(cnt2); 2897 jccb(Assembler::notZero, COPY_STR); 2898 2899 if (int_cnt2 < 0) { // not constant 2900 pop(cnt2); 2901 } 2902 movptr(str1, rsp); // New string address 2903 2904 bind(BIG_STRINGS); 2905 // Load substring. 2906 if (int_cnt2 < 0) { // -1 2907 if (ae == StrIntrinsicNode::UL) { 2908 pmovzxbw(vec, Address(str2, 0)); 2909 } else { 2910 movdqu(vec, Address(str2, 0)); 2911 } 2912 push(cnt2); // substr count 2913 push(str2); // substr addr 2914 push(str1); // string addr 2915 } else { 2916 // Small (< 8 chars) constant substrings are loaded already. 2917 movl(cnt2, int_cnt2); 2918 } 2919 push(tmp); // original SP 2920 2921 } // Finished loading 2922 2923 //======================================================== 2924 // Start search 2925 // 2926 2927 movptr(result, str1); // string addr 2928 2929 if (int_cnt2 < 0) { // Only for non constant substring 2930 jmpb(SCAN_TO_SUBSTR); 2931 2932 // SP saved at sp+0 2933 // String saved at sp+1*wordSize 2934 // Substr saved at sp+2*wordSize 2935 // Substr count saved at sp+3*wordSize 2936 2937 // Reload substr for rescan, this code 2938 // is executed only for large substrings (> 8 chars) 2939 bind(RELOAD_SUBSTR); 2940 movptr(str2, Address(rsp, 2*wordSize)); 2941 movl(cnt2, Address(rsp, 3*wordSize)); 2942 if (ae == StrIntrinsicNode::UL) { 2943 pmovzxbw(vec, Address(str2, 0)); 2944 } else { 2945 movdqu(vec, Address(str2, 0)); 2946 } 2947 // We came here after the beginning of the substring was 2948 // matched but the rest of it was not, so we need to search 2949 // again. Start from the next element after the previous match. 2950 subptr(str1, result); // Restore counter 2951 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2952 shrl(str1, 1); 2953 } 2954 addl(cnt1, str1); 2955 decrementl(cnt1); // Shift to next element 2956 cmpl(cnt1, cnt2); 2957 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2958 2959 addptr(result, (1<<scale1)); 2960 } // non constant 2961 2962 // Scan string for start of substr in 16-byte vectors 2963 bind(SCAN_TO_SUBSTR); 2964 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2965 pcmpestri(vec, Address(result, 0), mode); 2966 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2967 subl(cnt1, stride); 2968 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2969 cmpl(cnt1, cnt2); 2970 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2971 addptr(result, 16); 2972 2973 bind(ADJUST_STR); 2974 cmpl(cnt1, stride); // Do not read beyond string 2975 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2976 // Back-up string to avoid reading beyond string.
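  // That is, re-position 'result' so the final 16-byte read ends exactly at the
  // end of the string (result += cnt1*element_size - 16) and rescan that last
  // full vector with cnt1 = stride.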
2977 lea(result, Address(result, cnt1, scale1, -16)); 2978 movl(cnt1, stride); 2979 jmpb(SCAN_TO_SUBSTR); 2980 2981 // Found a potential substr 2982 bind(FOUND_CANDIDATE); 2983 // After pcmpestri tmp(rcx) contains matched element index 2984 2985 // Make sure string is still long enough 2986 subl(cnt1, tmp); 2987 cmpl(cnt1, cnt2); 2988 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 2989 // Left less then substring. 2990 2991 bind(RET_NOT_FOUND); 2992 movl(result, -1); 2993 jmp(CLEANUP); 2994 2995 bind(FOUND_SUBSTR); 2996 // Compute start addr of substr 2997 lea(result, Address(result, tmp, scale1)); 2998 if (int_cnt2 > 0) { // Constant substring 2999 // Repeat search for small substring (< 8 chars) 3000 // from new point without reloading substring. 3001 // Have to check that we don't read beyond string. 3002 cmpl(tmp, stride-int_cnt2); 3003 jccb(Assembler::greater, ADJUST_STR); 3004 // Fall through if matched whole substring. 3005 } else { // non constant 3006 assert(int_cnt2 == -1, "should be != 0"); 3007 3008 addl(tmp, cnt2); 3009 // Found result if we matched whole substring. 3010 cmpl(tmp, stride); 3011 jcc(Assembler::lessEqual, RET_FOUND); 3012 3013 // Repeat search for small substring (<= 8 chars) 3014 // from new point 'str1' without reloading substring. 3015 cmpl(cnt2, stride); 3016 // Have to check that we don't read beyond string. 3017 jccb(Assembler::lessEqual, ADJUST_STR); 3018 3019 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3020 // Compare the rest of substring (> 8 chars). 3021 movptr(str1, result); 3022 3023 cmpl(tmp, cnt2); 3024 // First 8 chars are already matched. 3025 jccb(Assembler::equal, CHECK_NEXT); 3026 3027 bind(SCAN_SUBSTR); 3028 pcmpestri(vec, Address(str1, 0), mode); 3029 // Need to reload strings pointers if not matched whole vector 3030 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3031 3032 bind(CHECK_NEXT); 3033 subl(cnt2, stride); 3034 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3035 addptr(str1, 16); 3036 if (ae == StrIntrinsicNode::UL) { 3037 addptr(str2, 8); 3038 } else { 3039 addptr(str2, 16); 3040 } 3041 subl(cnt1, stride); 3042 cmpl(cnt2, stride); // Do not read beyond substring 3043 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3044 // Back-up strings to avoid reading beyond substring. 
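    // Sketch of the re-alignment below (UL keeps str2 as bytes, hence -8 instead of -16):
    //   str2 += cnt2 * (1 << scale2) - 16;   // or -8 for UL
    //   str1 += cnt2 * (1 << scale1) - 16;
    //   cnt1  = cnt1 - cnt2 + stride;
    //   cnt2  = stride;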
3045 3046 if (ae == StrIntrinsicNode::UL) { 3047 lea(str2, Address(str2, cnt2, scale2, -8)); 3048 lea(str1, Address(str1, cnt2, scale1, -16)); 3049 } else { 3050 lea(str2, Address(str2, cnt2, scale2, -16)); 3051 lea(str1, Address(str1, cnt2, scale1, -16)); 3052 } 3053 subl(cnt1, cnt2); 3054 movl(cnt2, stride); 3055 addl(cnt1, stride); 3056 bind(CONT_SCAN_SUBSTR); 3057 if (ae == StrIntrinsicNode::UL) { 3058 pmovzxbw(vec, Address(str2, 0)); 3059 } else { 3060 movdqu(vec, Address(str2, 0)); 3061 } 3062 jmp(SCAN_SUBSTR); 3063 3064 bind(RET_FOUND_LONG); 3065 movptr(str1, Address(rsp, wordSize)); 3066 } // non constant 3067 3068 bind(RET_FOUND); 3069 // Compute substr offset 3070 subptr(result, str1); 3071 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3072 shrl(result, 1); // index 3073 } 3074 bind(CLEANUP); 3075 pop(rsp); // restore SP 3076 3077 } // string_indexof 3078 3079 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3080 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3081 ShortBranchVerifier sbv(this); 3082 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3083 3084 int stride = 8; 3085 3086 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3087 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3088 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3089 FOUND_SEQ_CHAR, DONE_LABEL; 3090 3091 movptr(result, str1); 3092 if (UseAVX >= 2) { 3093 cmpl(cnt1, stride); 3094 jcc(Assembler::less, SCAN_TO_CHAR); 3095 cmpl(cnt1, 2*stride); 3096 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3097 movdl(vec1, ch); 3098 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3099 vpxor(vec2, vec2); 3100 movl(tmp, cnt1); 3101 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3102 andl(cnt1,0x0000000F); //tail count (in chars) 3103 3104 bind(SCAN_TO_16_CHAR_LOOP); 3105 vmovdqu(vec3, Address(result, 0)); 3106 vpcmpeqw(vec3, vec3, vec1, 1); 3107 vptest(vec2, vec3); 3108 jcc(Assembler::carryClear, FOUND_CHAR); 3109 addptr(result, 32); 3110 subl(tmp, 2*stride); 3111 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3112 jmp(SCAN_TO_8_CHAR); 3113 bind(SCAN_TO_8_CHAR_INIT); 3114 movdl(vec1, ch); 3115 pshuflw(vec1, vec1, 0x00); 3116 pshufd(vec1, vec1, 0); 3117 pxor(vec2, vec2); 3118 } 3119 bind(SCAN_TO_8_CHAR); 3120 cmpl(cnt1, stride); 3121 jcc(Assembler::less, SCAN_TO_CHAR); 3122 if (UseAVX < 2) { 3123 movdl(vec1, ch); 3124 pshuflw(vec1, vec1, 0x00); 3125 pshufd(vec1, vec1, 0); 3126 pxor(vec2, vec2); 3127 } 3128 movl(tmp, cnt1); 3129 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3130 andl(cnt1,0x00000007); //tail count (in chars) 3131 3132 bind(SCAN_TO_8_CHAR_LOOP); 3133 movdqu(vec3, Address(result, 0)); 3134 pcmpeqw(vec3, vec1); 3135 ptest(vec2, vec3); 3136 jcc(Assembler::carryClear, FOUND_CHAR); 3137 addptr(result, 16); 3138 subl(tmp, stride); 3139 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3140 bind(SCAN_TO_CHAR); 3141 testl(cnt1, cnt1); 3142 jcc(Assembler::zero, RET_NOT_FOUND); 3143 bind(SCAN_TO_CHAR_LOOP); 3144 load_unsigned_short(tmp, Address(result, 0)); 3145 cmpl(ch, tmp); 3146 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3147 addptr(result, 2); 3148 subl(cnt1, 1); 3149 jccb(Assembler::zero, RET_NOT_FOUND); 3150 jmp(SCAN_TO_CHAR_LOOP); 3151 3152 bind(RET_NOT_FOUND); 3153 movl(result, -1); 3154 jmpb(DONE_LABEL); 3155 3156 bind(FOUND_CHAR); 3157 if (UseAVX >= 2) { 3158 vpmovmskb(tmp, vec3); 3159 } else { 3160 pmovmskb(tmp, vec3); 3161 } 3162 bsfl(ch, tmp); 3163 addptr(result, ch); 3164 3165 bind(FOUND_SEQ_CHAR); 3166 
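  // 'result' currently holds the address of the matching char; the two instructions below
  // convert it to a UTF-16 index, roughly: result = (result - str1) >> 1;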
subptr(result, str1); 3167 shrl(result, 1); 3168 3169 bind(DONE_LABEL); 3170 } // string_indexof_char 3171 3172 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3173 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3174 ShortBranchVerifier sbv(this); 3175 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3176 3177 int stride = 16; 3178 3179 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3180 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3181 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3182 FOUND_SEQ_CHAR, DONE_LABEL; 3183 3184 movptr(result, str1); 3185 if (UseAVX >= 2) { 3186 cmpl(cnt1, stride); 3187 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3188 cmpl(cnt1, stride*2); 3189 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3190 movdl(vec1, ch); 3191 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3192 vpxor(vec2, vec2); 3193 movl(tmp, cnt1); 3194 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3195 andl(cnt1,0x0000001F); //tail count (in chars) 3196 3197 bind(SCAN_TO_32_CHAR_LOOP); 3198 vmovdqu(vec3, Address(result, 0)); 3199 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3200 vptest(vec2, vec3); 3201 jcc(Assembler::carryClear, FOUND_CHAR); 3202 addptr(result, 32); 3203 subl(tmp, stride*2); 3204 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3205 jmp(SCAN_TO_16_CHAR); 3206 3207 bind(SCAN_TO_16_CHAR_INIT); 3208 movdl(vec1, ch); 3209 pxor(vec2, vec2); 3210 pshufb(vec1, vec2); 3211 } 3212 3213 bind(SCAN_TO_16_CHAR); 3214 cmpl(cnt1, stride); 3215 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3216 if (UseAVX < 2) { 3217 movdl(vec1, ch); 3218 pxor(vec2, vec2); 3219 pshufb(vec1, vec2); 3220 } 3221 movl(tmp, cnt1); 3222 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3223 andl(cnt1,0x0000000F); //tail count (in bytes) 3224 3225 bind(SCAN_TO_16_CHAR_LOOP); 3226 movdqu(vec3, Address(result, 0)); 3227 pcmpeqb(vec3, vec1); 3228 ptest(vec2, vec3); 3229 jcc(Assembler::carryClear, FOUND_CHAR); 3230 addptr(result, 16); 3231 subl(tmp, stride); 3232 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
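  // Scalar tail over the remaining (< 16) bytes; conceptually:
  //   while (cnt1 != 0) {
  //     if (*result == ch) goto FOUND_SEQ_CHAR;
  //     result++; cnt1--;
  //   }
  //   goto RET_NOT_FOUND;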
3233 3234 bind(SCAN_TO_CHAR_INIT); 3235 testl(cnt1, cnt1); 3236 jcc(Assembler::zero, RET_NOT_FOUND); 3237 bind(SCAN_TO_CHAR_LOOP); 3238 load_unsigned_byte(tmp, Address(result, 0)); 3239 cmpl(ch, tmp); 3240 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3241 addptr(result, 1); 3242 subl(cnt1, 1); 3243 jccb(Assembler::zero, RET_NOT_FOUND); 3244 jmp(SCAN_TO_CHAR_LOOP); 3245 3246 bind(RET_NOT_FOUND); 3247 movl(result, -1); 3248 jmpb(DONE_LABEL); 3249 3250 bind(FOUND_CHAR); 3251 if (UseAVX >= 2) { 3252 vpmovmskb(tmp, vec3); 3253 } else { 3254 pmovmskb(tmp, vec3); 3255 } 3256 bsfl(ch, tmp); 3257 addptr(result, ch); 3258 3259 bind(FOUND_SEQ_CHAR); 3260 subptr(result, str1); 3261 3262 bind(DONE_LABEL); 3263 } // stringL_indexof_char 3264 3265 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3266 switch (eltype) { 3267 case T_BOOLEAN: return sizeof(jboolean); 3268 case T_BYTE: return sizeof(jbyte); 3269 case T_SHORT: return sizeof(jshort); 3270 case T_CHAR: return sizeof(jchar); 3271 case T_INT: return sizeof(jint); 3272 default: 3273 ShouldNotReachHere(); 3274 return -1; 3275 } 3276 } 3277 3278 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3279 switch (eltype) { 3280 // T_BOOLEAN used as surrogate for unsigned byte 3281 case T_BOOLEAN: movzbl(dst, src); break; 3282 case T_BYTE: movsbl(dst, src); break; 3283 case T_SHORT: movswl(dst, src); break; 3284 case T_CHAR: movzwl(dst, src); break; 3285 case T_INT: movl(dst, src); break; 3286 default: 3287 ShouldNotReachHere(); 3288 } 3289 } 3290 3291 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3292 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3293 } 3294 3295 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3296 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3297 } 3298 3299 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3300 const int vlen = Assembler::AVX_256bit; 3301 switch (eltype) { 3302 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3303 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3304 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3305 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3306 case T_INT: 3307 // do nothing 3308 break; 3309 default: 3310 ShouldNotReachHere(); 3311 } 3312 } 3313 3314 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3315 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3316 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3317 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3318 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3319 BasicType eltype) { 3320 ShortBranchVerifier sbv(this); 3321 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3322 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3323 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3324 3325 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3326 SHORT_UNROLLED_LOOP_EXIT, 3327 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3328 UNROLLED_VECTOR_LOOP_BEGIN, 3329 END; 3330 switch (eltype) { 3331 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3332 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3333 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3334 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3335 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3336 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3337 } 3338 3339 // For "renaming" for readibility of the code 3340 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3341 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3342 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3343 3344 const int elsize = arrays_hashcode_elsize(eltype); 3345 3346 /* 3347 if (cnt1 >= 2) { 3348 if (cnt1 >= 32) { 3349 UNROLLED VECTOR LOOP 3350 } 3351 UNROLLED SCALAR LOOP 3352 } 3353 SINGLE SCALAR 3354 */ 3355 3356 cmpl(cnt1, 32); 3357 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3358 3359 // cnt1 >= 32 && generate_vectorized_loop 3360 xorl(index, index); 3361 3362 // vresult = IntVector.zero(I256); 3363 for (int idx = 0; idx < 4; idx++) { 3364 vpxor(vresult[idx], vresult[idx]); 3365 } 3366 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3367 Register bound = tmp2; 3368 Register next = tmp3; 3369 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3370 movl(next, Address(tmp2, 0)); 3371 movdl(vnext, next); 3372 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3373 3374 // index = 0; 3375 // bound = cnt1 & ~(32 - 1); 3376 movl(bound, cnt1); 3377 andl(bound, ~(32 - 1)); 3378 // for (; index < bound; index += 32) { 3379 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3380 // result *= next; 3381 imull(result, next); 3382 // loop fission to upfront the cost of fetching from memory, OOO execution 3383 // can then hopefully do a better job of prefetching 3384 for (int idx = 0; idx < 4; idx++) { 3385 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3386 } 3387 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3388 for (int idx = 0; idx < 4; idx++) { 3389 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3390 arrays_hashcode_elvcast(vtmp[idx], eltype); 3391 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3392 } 3393 // index += 32; 3394 addl(index, 32); 3395 // index < bound; 3396 cmpl(index, bound); 3397 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3398 // } 3399 3400 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3401 subl(cnt1, bound); 3402 // release bound 3403 3404 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3405 for (int idx = 0; idx < 4; idx++) { 3406 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3407 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3408 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3409 } 3410 // result += vresult.reduceLanes(ADD); 3411 for (int idx = 0; idx < 4; idx++) { 3412 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3413 } 3414 3415 // } else if (cnt1 < 32) { 3416 3417 bind(SHORT_UNROLLED_BEGIN); 3418 // int i = 1; 3419 movl(index, 1); 3420 cmpl(index, cnt1); 3421 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3422 3423 // for (; i < cnt1 ; i += 2) { 3424 bind(SHORT_UNROLLED_LOOP_BEGIN); 3425 movl(tmp3, 961); 3426 imull(result, tmp3); 3427 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3428 movl(tmp3, tmp2); 3429 shll(tmp3, 5); 3430 subl(tmp3, tmp2); 3431 addl(result, tmp3); 3432 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3433 addl(result, tmp3); 3434 addl(index, 2); 3435 cmpl(index, cnt1); 3436 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3437 3438 // } 3439 // if (i >= cnt1) { 3440 bind(SHORT_UNROLLED_LOOP_EXIT); 3441 jccb(Assembler::greater, END); 3442 movl(tmp2, result); 3443 shll(result, 5); 3444 subl(result, tmp2); 3445 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3446 addl(result, tmp3); 3447 // } 3448 bind(END); 3449 3450 BLOCK_COMMENT("} // arrays_hashcode"); 3451 3452 } // arrays_hashcode 3453 3454 // helper function for string_compare 3455 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3456 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3457 Address::ScaleFactor scale2, Register index, int ae) { 3458 if (ae == StrIntrinsicNode::LL) { 3459 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3460 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3461 } else if (ae == StrIntrinsicNode::UU) { 3462 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3463 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3464 } else { 3465 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3466 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3467 } 3468 } 3469 3470 // Compare strings, used for char[] and byte[]. 3471 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3472 Register cnt1, Register cnt2, Register result, 3473 XMMRegister vec1, int ae, KRegister mask) { 3474 ShortBranchVerifier sbv(this); 3475 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3476 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3477 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3478 int stride2x2 = 0x40; 3479 Address::ScaleFactor scale = Address::no_scale; 3480 Address::ScaleFactor scale1 = Address::no_scale; 3481 Address::ScaleFactor scale2 = Address::no_scale; 3482 3483 if (ae != StrIntrinsicNode::LL) { 3484 stride2x2 = 0x20; 3485 } 3486 3487 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3488 shrl(cnt2, 1); 3489 } 3490 // Compute the minimum of the string lengths and the 3491 // difference of the string lengths (stack). 3492 // Do the conditional move stuff 3493 movl(result, cnt1); 3494 subl(cnt1, cnt2); 3495 push(cnt1); 3496 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3497 3498 // Is the minimum length zero? 
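  // Conceptually: if (min(cnt1, cnt2) == 0) the answer is just the length difference that
  // was pushed above, so jump straight to LENGTH_DIFF_LABEL.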
3499 testl(cnt2, cnt2); 3500 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3501 if (ae == StrIntrinsicNode::LL) { 3502 // Load first bytes 3503 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3504 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3505 } else if (ae == StrIntrinsicNode::UU) { 3506 // Load first characters 3507 load_unsigned_short(result, Address(str1, 0)); 3508 load_unsigned_short(cnt1, Address(str2, 0)); 3509 } else { 3510 load_unsigned_byte(result, Address(str1, 0)); 3511 load_unsigned_short(cnt1, Address(str2, 0)); 3512 } 3513 subl(result, cnt1); 3514 jcc(Assembler::notZero, POP_LABEL); 3515 3516 if (ae == StrIntrinsicNode::UU) { 3517 // Divide length by 2 to get number of chars 3518 shrl(cnt2, 1); 3519 } 3520 cmpl(cnt2, 1); 3521 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3522 3523 // Check if the strings start at the same location and setup scale and stride 3524 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3525 cmpptr(str1, str2); 3526 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3527 if (ae == StrIntrinsicNode::LL) { 3528 scale = Address::times_1; 3529 stride = 16; 3530 } else { 3531 scale = Address::times_2; 3532 stride = 8; 3533 } 3534 } else { 3535 scale1 = Address::times_1; 3536 scale2 = Address::times_2; 3537 // scale not used 3538 stride = 8; 3539 } 3540 3541 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3542 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3543 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3544 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3545 Label COMPARE_TAIL_LONG; 3546 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3547 3548 int pcmpmask = 0x19; 3549 if (ae == StrIntrinsicNode::LL) { 3550 pcmpmask &= ~0x01; 3551 } 3552 3553 // Setup to compare 16-chars (32-bytes) vectors, 3554 // start from first character again because it has aligned address. 3555 if (ae == StrIntrinsicNode::LL) { 3556 stride2 = 32; 3557 } else { 3558 stride2 = 16; 3559 } 3560 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3561 adr_stride = stride << scale; 3562 } else { 3563 adr_stride1 = 8; //stride << scale1; 3564 adr_stride2 = 16; //stride << scale2; 3565 } 3566 3567 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3568 // rax and rdx are used by pcmpestri as elements counters 3569 movl(result, cnt2); 3570 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3571 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3572 3573 // fast path : compare first 2 8-char vectors. 
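  // Sketch of the fast path below: two pcmpestri probes cover chars [0..7] and [8..15].
  // With the negated equal-each mode in pcmpmask, CF==1 ("below") means a mismatch was
  // found and rcx holds its element index, which COMPARE_INDEX_CHAR turns into the result.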
3574 bind(COMPARE_16_CHARS); 3575 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3576 movdqu(vec1, Address(str1, 0)); 3577 } else { 3578 pmovzxbw(vec1, Address(str1, 0)); 3579 } 3580 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3581 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3582 3583 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3584 movdqu(vec1, Address(str1, adr_stride)); 3585 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3586 } else { 3587 pmovzxbw(vec1, Address(str1, adr_stride1)); 3588 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3589 } 3590 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3591 addl(cnt1, stride); 3592 3593 // Compare the characters at index in cnt1 3594 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3595 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3596 subl(result, cnt2); 3597 jmp(POP_LABEL); 3598 3599 // Setup the registers to start vector comparison loop 3600 bind(COMPARE_WIDE_VECTORS); 3601 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3602 lea(str1, Address(str1, result, scale)); 3603 lea(str2, Address(str2, result, scale)); 3604 } else { 3605 lea(str1, Address(str1, result, scale1)); 3606 lea(str2, Address(str2, result, scale2)); 3607 } 3608 subl(result, stride2); 3609 subl(cnt2, stride2); 3610 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3611 negptr(result); 3612 3613 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3614 bind(COMPARE_WIDE_VECTORS_LOOP); 3615 3616 #ifdef _LP64 3617 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3618 cmpl(cnt2, stride2x2); 3619 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3620 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3621 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3622 3623 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3624 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3625 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3626 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3627 } else { 3628 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3629 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3630 } 3631 kortestql(mask, mask); 3632 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3633 addptr(result, stride2x2); // update since we already compared at this addr 3634 subl(cnt2, stride2x2); // and sub the size too 3635 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3636 3637 vpxor(vec1, vec1); 3638 jmpb(COMPARE_WIDE_TAIL); 3639 }//if (VM_Version::supports_avx512vlbw()) 3640 #endif // _LP64 3641 3642 3643 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3644 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3645 vmovdqu(vec1, Address(str1, result, scale)); 3646 vpxor(vec1, Address(str2, result, scale)); 3647 } else { 3648 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3649 vpxor(vec1, Address(str2, result, scale2)); 3650 } 3651 vptest(vec1, vec1); 3652 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3653 addptr(result, stride2); 3654 subl(cnt2, stride2); 3655 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3656 // clean upper bits of YMM registers 
3657 vpxor(vec1, vec1); 3658 3659 // compare wide vectors tail 3660 bind(COMPARE_WIDE_TAIL); 3661 testptr(result, result); 3662 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3663 3664 movl(result, stride2); 3665 movl(cnt2, result); 3666 negptr(result); 3667 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3668 3669 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3670 bind(VECTOR_NOT_EQUAL); 3671 // clean upper bits of YMM registers 3672 vpxor(vec1, vec1); 3673 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3674 lea(str1, Address(str1, result, scale)); 3675 lea(str2, Address(str2, result, scale)); 3676 } else { 3677 lea(str1, Address(str1, result, scale1)); 3678 lea(str2, Address(str2, result, scale2)); 3679 } 3680 jmp(COMPARE_16_CHARS); 3681 3682 // Compare tail chars, length between 1 to 15 chars 3683 bind(COMPARE_TAIL_LONG); 3684 movl(cnt2, result); 3685 cmpl(cnt2, stride); 3686 jcc(Assembler::less, COMPARE_SMALL_STR); 3687 3688 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3689 movdqu(vec1, Address(str1, 0)); 3690 } else { 3691 pmovzxbw(vec1, Address(str1, 0)); 3692 } 3693 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3694 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3695 subptr(cnt2, stride); 3696 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3697 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3698 lea(str1, Address(str1, result, scale)); 3699 lea(str2, Address(str2, result, scale)); 3700 } else { 3701 lea(str1, Address(str1, result, scale1)); 3702 lea(str2, Address(str2, result, scale2)); 3703 } 3704 negptr(cnt2); 3705 jmpb(WHILE_HEAD_LABEL); 3706 3707 bind(COMPARE_SMALL_STR); 3708 } else if (UseSSE42Intrinsics) { 3709 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3710 int pcmpmask = 0x19; 3711 // Setup to compare 8-char (16-byte) vectors, 3712 // start from first character again because it has aligned address. 
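  // The split being set up below, roughly:
  //   result = cnt2;              // total remaining element count
  //   cnt2  &= ~(stride - 1);     // elements handled by full 8-char vectors; rest goes to the tail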
3713 movl(result, cnt2); 3714 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3715 if (ae == StrIntrinsicNode::LL) { 3716 pcmpmask &= ~0x01; 3717 } 3718 jcc(Assembler::zero, COMPARE_TAIL); 3719 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3720 lea(str1, Address(str1, result, scale)); 3721 lea(str2, Address(str2, result, scale)); 3722 } else { 3723 lea(str1, Address(str1, result, scale1)); 3724 lea(str2, Address(str2, result, scale2)); 3725 } 3726 negptr(result); 3727 3728 // pcmpestri 3729 // inputs: 3730 // vec1- substring 3731 // rax - negative string length (elements count) 3732 // mem - scanned string 3733 // rdx - string length (elements count) 3734 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3735 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3736 // outputs: 3737 // rcx - first mismatched element index 3738 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3739 3740 bind(COMPARE_WIDE_VECTORS); 3741 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3742 movdqu(vec1, Address(str1, result, scale)); 3743 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3744 } else { 3745 pmovzxbw(vec1, Address(str1, result, scale1)); 3746 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3747 } 3748 // After pcmpestri cnt1(rcx) contains mismatched element index 3749 3750 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3751 addptr(result, stride); 3752 subptr(cnt2, stride); 3753 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3754 3755 // compare wide vectors tail 3756 testptr(result, result); 3757 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3758 3759 movl(cnt2, stride); 3760 movl(result, stride); 3761 negptr(result); 3762 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3763 movdqu(vec1, Address(str1, result, scale)); 3764 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3765 } else { 3766 pmovzxbw(vec1, Address(str1, result, scale1)); 3767 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3768 } 3769 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3770 3771 // Mismatched characters in the vectors 3772 bind(VECTOR_NOT_EQUAL); 3773 addptr(cnt1, result); 3774 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3775 subl(result, cnt2); 3776 jmpb(POP_LABEL); 3777 3778 bind(COMPARE_TAIL); // limit is zero 3779 movl(cnt2, result); 3780 // Fallthru to tail compare 3781 } 3782 // Shift str2 and str1 to the end of the arrays, negate min 3783 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3784 lea(str1, Address(str1, cnt2, scale)); 3785 lea(str2, Address(str2, cnt2, scale)); 3786 } else { 3787 lea(str1, Address(str1, cnt2, scale1)); 3788 lea(str2, Address(str2, cnt2, scale2)); 3789 } 3790 decrementl(cnt2); // first character was compared already 3791 negptr(cnt2); 3792 3793 // Compare the rest of the elements 3794 bind(WHILE_HEAD_LABEL); 3795 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3796 subl(result, cnt1); 3797 jccb(Assembler::notZero, POP_LABEL); 3798 increment(cnt2); 3799 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3800 3801 // Strings are equal up to min length. Return the length difference. 
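  // i.e. return the difference pushed near the top of this method (cnt1 - cnt2 after the
  // initial adjustments); for UU it is still a byte count here, hence the sarl below.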
3802 bind(LENGTH_DIFF_LABEL); 3803 pop(result); 3804 if (ae == StrIntrinsicNode::UU) { 3805 // Divide diff by 2 to get number of chars 3806 sarl(result, 1); 3807 } 3808 jmpb(DONE_LABEL); 3809 3810 #ifdef _LP64 3811 if (VM_Version::supports_avx512vlbw()) { 3812 3813 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3814 3815 kmovql(cnt1, mask); 3816 notq(cnt1); 3817 bsfq(cnt2, cnt1); 3818 if (ae != StrIntrinsicNode::LL) { 3819 // Divide diff by 2 to get number of chars 3820 sarl(cnt2, 1); 3821 } 3822 addq(result, cnt2); 3823 if (ae == StrIntrinsicNode::LL) { 3824 load_unsigned_byte(cnt1, Address(str2, result)); 3825 load_unsigned_byte(result, Address(str1, result)); 3826 } else if (ae == StrIntrinsicNode::UU) { 3827 load_unsigned_short(cnt1, Address(str2, result, scale)); 3828 load_unsigned_short(result, Address(str1, result, scale)); 3829 } else { 3830 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3831 load_unsigned_byte(result, Address(str1, result, scale1)); 3832 } 3833 subl(result, cnt1); 3834 jmpb(POP_LABEL); 3835 }//if (VM_Version::supports_avx512vlbw()) 3836 #endif // _LP64 3837 3838 // Discard the stored length difference 3839 bind(POP_LABEL); 3840 pop(cnt1); 3841 3842 // That's it 3843 bind(DONE_LABEL); 3844 if(ae == StrIntrinsicNode::UL) { 3845 negl(result); 3846 } 3847 3848 } 3849 3850 // Search for Non-ASCII character (Negative byte value) in a byte array, 3851 // return the index of the first such character, otherwise the length 3852 // of the array segment searched. 3853 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3854 // @IntrinsicCandidate 3855 // public static int countPositives(byte[] ba, int off, int len) { 3856 // for (int i = off; i < off + len; i++) { 3857 // if (ba[i] < 0) { 3858 // return i - off; 3859 // } 3860 // } 3861 // return len; 3862 // } 3863 void C2_MacroAssembler::count_positives(Register ary1, Register len, 3864 Register result, Register tmp1, 3865 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3866 // rsi: byte array 3867 // rcx: len 3868 // rax: result 3869 ShortBranchVerifier sbv(this); 3870 assert_different_registers(ary1, len, result, tmp1); 3871 assert_different_registers(vec1, vec2); 3872 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3873 3874 movl(result, len); // copy 3875 // len == 0 3876 testl(len, len); 3877 jcc(Assembler::zero, DONE); 3878 3879 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3880 VM_Version::supports_avx512vlbw() && 3881 VM_Version::supports_bmi2()) { 3882 3883 Label test_64_loop, test_tail, BREAK_LOOP; 3884 Register tmp3_aliased = len; 3885 3886 movl(tmp1, len); 3887 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3888 3889 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F 3890 andl(len, ~(64 - 1)); // vector count (in chars) 3891 jccb(Assembler::zero, test_tail); 3892 3893 lea(ary1, Address(ary1, len, Address::times_1)); 3894 negptr(len); 3895 3896 bind(test_64_loop); 3897 // Check whether our 64 elements of size byte contain negatives 3898 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3899 kortestql(mask1, mask1); 3900 jcc(Assembler::notZero, BREAK_LOOP); 3901 3902 addptr(len, 64); 3903 jccb(Assembler::notZero, test_64_loop); 3904 3905 bind(test_tail); 3906 // bail out when there is nothing to be done 3907 testl(tmp1, -1); 3908 jcc(Assembler::zero, DONE); 3909 3910 // ~(~0 << len) applied up to two times (for 32-bit scenario) 3911 #ifdef _LP64 3912 mov64(tmp3_aliased, 
0xFFFFFFFFFFFFFFFF); 3913 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 3914 notq(tmp3_aliased); 3915 kmovql(mask2, tmp3_aliased); 3916 #else 3917 Label k_init; 3918 jmp(k_init); 3919 3920 // We could not read 64-bits from a general purpose register thus we move 3921 // data required to compose 64 1's to the instruction stream 3922 // We emit 64 byte wide series of elements from 0..63 which later on would 3923 // be used as a compare targets with tail count contained in tmp1 register. 3924 // Result would be a k register having tmp1 consecutive number or 1 3925 // counting from least significant bit. 3926 address tmp = pc(); 3927 emit_int64(0x0706050403020100); 3928 emit_int64(0x0F0E0D0C0B0A0908); 3929 emit_int64(0x1716151413121110); 3930 emit_int64(0x1F1E1D1C1B1A1918); 3931 emit_int64(0x2726252423222120); 3932 emit_int64(0x2F2E2D2C2B2A2928); 3933 emit_int64(0x3736353433323130); 3934 emit_int64(0x3F3E3D3C3B3A3938); 3935 3936 bind(k_init); 3937 lea(len, InternalAddress(tmp)); 3938 // create mask to test for negative byte inside a vector 3939 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 3940 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 3941 3942 #endif 3943 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 3944 ktestq(mask1, mask2); 3945 jcc(Assembler::zero, DONE); 3946 3947 bind(BREAK_LOOP); 3948 // At least one byte in the last 64 bytes is negative. 3949 // Set up to look at the last 64 bytes as if they were a tail 3950 lea(ary1, Address(ary1, len, Address::times_1)); 3951 addptr(result, len); 3952 // Ignore the very last byte: if all others are positive, 3953 // it must be negative, so we can skip right to the 2+1 byte 3954 // end comparison at this point 3955 orl(result, 63); 3956 movl(len, 63); 3957 // Fallthru to tail compare 3958 } else { 3959 3960 if (UseAVX >= 2 && UseSSE >= 2) { 3961 // With AVX2, use 32-byte vector compare 3962 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 3963 3964 // Compare 32-byte vectors 3965 testl(len, 0xffffffe0); // vector count (in bytes) 3966 jccb(Assembler::zero, TAIL_START); 3967 3968 andl(len, 0xffffffe0); 3969 lea(ary1, Address(ary1, len, Address::times_1)); 3970 negptr(len); 3971 3972 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 3973 movdl(vec2, tmp1); 3974 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 3975 3976 bind(COMPARE_WIDE_VECTORS); 3977 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 3978 vptest(vec1, vec2); 3979 jccb(Assembler::notZero, BREAK_LOOP); 3980 addptr(len, 32); 3981 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3982 3983 testl(result, 0x0000001f); // any bytes remaining? 3984 jcc(Assembler::zero, DONE); 3985 3986 // Quick test using the already prepared vector mask 3987 movl(len, result); 3988 andl(len, 0x0000001f); 3989 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 3990 vptest(vec1, vec2); 3991 jcc(Assembler::zero, DONE); 3992 // There are zeros, jump to the tail to determine exactly where 3993 jmpb(TAIL_START); 3994 3995 bind(BREAK_LOOP); 3996 // At least one byte in the last 32-byte vector is negative. 
3997 // Set up to look at the last 32 bytes as if they were a tail 3998 lea(ary1, Address(ary1, len, Address::times_1)); 3999 addptr(result, len); 4000 // Ignore the very last byte: if all others are positive, 4001 // it must be negative, so we can skip right to the 2+1 byte 4002 // end comparison at this point 4003 orl(result, 31); 4004 movl(len, 31); 4005 // Fallthru to tail compare 4006 } else if (UseSSE42Intrinsics) { 4007 // With SSE4.2, use double quad vector compare 4008 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4009 4010 // Compare 16-byte vectors 4011 testl(len, 0xfffffff0); // vector count (in bytes) 4012 jcc(Assembler::zero, TAIL_START); 4013 4014 andl(len, 0xfffffff0); 4015 lea(ary1, Address(ary1, len, Address::times_1)); 4016 negptr(len); 4017 4018 movl(tmp1, 0x80808080); 4019 movdl(vec2, tmp1); 4020 pshufd(vec2, vec2, 0); 4021 4022 bind(COMPARE_WIDE_VECTORS); 4023 movdqu(vec1, Address(ary1, len, Address::times_1)); 4024 ptest(vec1, vec2); 4025 jccb(Assembler::notZero, BREAK_LOOP); 4026 addptr(len, 16); 4027 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4028 4029 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4030 jcc(Assembler::zero, DONE); 4031 4032 // Quick test using the already prepared vector mask 4033 movl(len, result); 4034 andl(len, 0x0000000f); // tail count (in bytes) 4035 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4036 ptest(vec1, vec2); 4037 jcc(Assembler::zero, DONE); 4038 jmpb(TAIL_START); 4039 4040 bind(BREAK_LOOP); 4041 // At least one byte in the last 16-byte vector is negative. 4042 // Set up and look at the last 16 bytes as if they were a tail 4043 lea(ary1, Address(ary1, len, Address::times_1)); 4044 addptr(result, len); 4045 // Ignore the very last byte: if all others are positive, 4046 // it must be negative, so we can skip right to the 2+1 byte 4047 // end comparison at this point 4048 orl(result, 15); 4049 movl(len, 15); 4050 // Fallthru to tail compare 4051 } 4052 } 4053 4054 bind(TAIL_START); 4055 // Compare 4-byte vectors 4056 andl(len, 0xfffffffc); // vector count (in bytes) 4057 jccb(Assembler::zero, COMPARE_CHAR); 4058 4059 lea(ary1, Address(ary1, len, Address::times_1)); 4060 negptr(len); 4061 4062 bind(COMPARE_VECTORS); 4063 movl(tmp1, Address(ary1, len, Address::times_1)); 4064 andl(tmp1, 0x80808080); 4065 jccb(Assembler::notZero, TAIL_ADJUST); 4066 addptr(len, 4); 4067 jccb(Assembler::notZero, COMPARE_VECTORS); 4068 4069 // Compare trailing char (final 2-3 bytes), if any 4070 bind(COMPARE_CHAR); 4071 4072 testl(result, 0x2); // tail char 4073 jccb(Assembler::zero, COMPARE_BYTE); 4074 load_unsigned_short(tmp1, Address(ary1, 0)); 4075 andl(tmp1, 0x00008080); 4076 jccb(Assembler::notZero, CHAR_ADJUST); 4077 lea(ary1, Address(ary1, 2)); 4078 4079 bind(COMPARE_BYTE); 4080 testl(result, 0x1); // tail byte 4081 jccb(Assembler::zero, DONE); 4082 load_unsigned_byte(tmp1, Address(ary1, 0)); 4083 testl(tmp1, 0x00000080); 4084 jccb(Assembler::zero, DONE); 4085 subptr(result, 1); 4086 jmpb(DONE); 4087 4088 bind(TAIL_ADJUST); 4089 // there are negative bits in the last 4 byte block. 4090 // Adjust result and check the next three bytes 4091 addptr(result, len); 4092 orl(result, 3); 4093 lea(ary1, Address(ary1, len, Address::times_1)); 4094 jmpb(COMPARE_CHAR); 4095 4096 bind(CHAR_ADJUST); 4097 // We are looking at a char + optional byte tail, and found that one 4098 // of the bytes in the char is negative. Adjust the result, check the 4099 // first byte and readjust if needed. 
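  // Sketch of the fix-up below (tmp1 holds the two tail bytes, low byte first):
  //   result &= ~3;                        // index of the low byte of the char
  //   if ((tmp1 & 0x80) == 0) result += 1; // low byte positive -> the negative one is the high byte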
4100 andl(result, 0xfffffffc); 4101 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4102 jccb(Assembler::notZero, DONE); 4103 addptr(result, 1); 4104 4105 // That's it 4106 bind(DONE); 4107 if (UseAVX >= 2 && UseSSE >= 2) { 4108 // clean upper bits of YMM registers 4109 vpxor(vec1, vec1); 4110 vpxor(vec2, vec2); 4111 } 4112 } 4113 4114 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4115 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4116 Register limit, Register result, Register chr, 4117 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) { 4118 ShortBranchVerifier sbv(this); 4119 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4120 4121 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4122 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4123 4124 if (is_array_equ) { 4125 // Check the input args 4126 cmpoop(ary1, ary2); 4127 jcc(Assembler::equal, TRUE_LABEL); 4128 4129 // Need additional checks for arrays_equals. 4130 testptr(ary1, ary1); 4131 jcc(Assembler::zero, FALSE_LABEL); 4132 testptr(ary2, ary2); 4133 jcc(Assembler::zero, FALSE_LABEL); 4134 4135 // Check the lengths 4136 movl(limit, Address(ary1, length_offset)); 4137 cmpl(limit, Address(ary2, length_offset)); 4138 jcc(Assembler::notEqual, FALSE_LABEL); 4139 } 4140 4141 // count == 0 4142 testl(limit, limit); 4143 jcc(Assembler::zero, TRUE_LABEL); 4144 4145 if (is_array_equ) { 4146 // Load array address 4147 lea(ary1, Address(ary1, base_offset)); 4148 lea(ary2, Address(ary2, base_offset)); 4149 } 4150 4151 if (is_array_equ && is_char) { 4152 // arrays_equals when used for char[]. 4153 shll(limit, 1); // byte count != 0 4154 } 4155 movl(result, limit); // copy 4156 4157 if (UseAVX >= 2) { 4158 // With AVX2, use 32-byte vector compare 4159 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4160 4161 // Compare 32-byte vectors 4162 andl(result, 0x0000001f); // tail count (in bytes) 4163 andl(limit, 0xffffffe0); // vector count (in bytes) 4164 jcc(Assembler::zero, COMPARE_TAIL); 4165 4166 lea(ary1, Address(ary1, limit, Address::times_1)); 4167 lea(ary2, Address(ary2, limit, Address::times_1)); 4168 negptr(limit); 4169 4170 #ifdef _LP64 4171 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4172 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4173 4174 cmpl(limit, -64); 4175 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4176 4177 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4178 4179 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4180 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4181 kortestql(mask, mask); 4182 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4183 addptr(limit, 64); // update since we already compared at this addr 4184 cmpl(limit, -64); 4185 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4186 4187 // At this point we may still need to compare -limit+result bytes. 4188 // We could execute the next two instruction and just continue via non-wide path: 4189 // cmpl(limit, 0); 4190 // jcc(Assembler::equal, COMPARE_TAIL); // true 4191 // But since we stopped at the points ary{1,2}+limit which are 4192 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4193 // (|limit| <= 32 and result < 32), 4194 // we may just compare the last 64 bytes. 
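  // In other words, the final probe below deliberately overlaps bytes that are already known
  // to be equal: it compares ary1[result-64 .. result-1] with ary2[result-64 .. result-1],
  // which always covers the not-yet-compared tail.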
4195 // 4196 addptr(result, -64); // it is safe, bc we just came from this area 4197 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4198 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4199 kortestql(mask, mask); 4200 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4201 4202 jmp(TRUE_LABEL); 4203 4204 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4205 4206 }//if (VM_Version::supports_avx512vlbw()) 4207 #endif //_LP64 4208 bind(COMPARE_WIDE_VECTORS); 4209 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 4210 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4211 vpxor(vec1, vec2); 4212 4213 vptest(vec1, vec1); 4214 jcc(Assembler::notZero, FALSE_LABEL); 4215 addptr(limit, 32); 4216 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4217 4218 testl(result, result); 4219 jcc(Assembler::zero, TRUE_LABEL); 4220 4221 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 4222 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4223 vpxor(vec1, vec2); 4224 4225 vptest(vec1, vec1); 4226 jccb(Assembler::notZero, FALSE_LABEL); 4227 jmpb(TRUE_LABEL); 4228 4229 bind(COMPARE_TAIL); // limit is zero 4230 movl(limit, result); 4231 // Fallthru to tail compare 4232 } else if (UseSSE42Intrinsics) { 4233 // With SSE4.2, use double quad vector compare 4234 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4235 4236 // Compare 16-byte vectors 4237 andl(result, 0x0000000f); // tail count (in bytes) 4238 andl(limit, 0xfffffff0); // vector count (in bytes) 4239 jcc(Assembler::zero, COMPARE_TAIL); 4240 4241 lea(ary1, Address(ary1, limit, Address::times_1)); 4242 lea(ary2, Address(ary2, limit, Address::times_1)); 4243 negptr(limit); 4244 4245 bind(COMPARE_WIDE_VECTORS); 4246 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4247 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4248 pxor(vec1, vec2); 4249 4250 ptest(vec1, vec1); 4251 jcc(Assembler::notZero, FALSE_LABEL); 4252 addptr(limit, 16); 4253 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4254 4255 testl(result, result); 4256 jcc(Assembler::zero, TRUE_LABEL); 4257 4258 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4259 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4260 pxor(vec1, vec2); 4261 4262 ptest(vec1, vec1); 4263 jccb(Assembler::notZero, FALSE_LABEL); 4264 jmpb(TRUE_LABEL); 4265 4266 bind(COMPARE_TAIL); // limit is zero 4267 movl(limit, result); 4268 // Fallthru to tail compare 4269 } 4270 4271 // Compare 4-byte vectors 4272 andl(limit, 0xfffffffc); // vector count (in bytes) 4273 jccb(Assembler::zero, COMPARE_CHAR); 4274 4275 lea(ary1, Address(ary1, limit, Address::times_1)); 4276 lea(ary2, Address(ary2, limit, Address::times_1)); 4277 negptr(limit); 4278 4279 bind(COMPARE_VECTORS); 4280 movl(chr, Address(ary1, limit, Address::times_1)); 4281 cmpl(chr, Address(ary2, limit, Address::times_1)); 4282 jccb(Assembler::notEqual, FALSE_LABEL); 4283 addptr(limit, 4); 4284 jcc(Assembler::notZero, COMPARE_VECTORS); 4285 4286 // Compare trailing char (final 2 bytes), if any 4287 bind(COMPARE_CHAR); 4288 testl(result, 0x2); // tail char 4289 jccb(Assembler::zero, COMPARE_BYTE); 4290 load_unsigned_short(chr, Address(ary1, 0)); 4291 load_unsigned_short(limit, Address(ary2, 0)); 4292 cmpl(chr, limit); 4293 jccb(Assembler::notEqual, FALSE_LABEL); 4294 4295 if (is_array_equ && is_char) { 4296 bind(COMPARE_BYTE); 4297 } else { 4298 lea(ary1, Address(ary1, 2)); 4299 lea(ary2, Address(ary2, 2)); 4300 4301 bind(COMPARE_BYTE); 4302 testl(result, 0x1); 
// tail byte 4303 jccb(Assembler::zero, TRUE_LABEL); 4304 load_unsigned_byte(chr, Address(ary1, 0)); 4305 load_unsigned_byte(limit, Address(ary2, 0)); 4306 cmpl(chr, limit); 4307 jccb(Assembler::notEqual, FALSE_LABEL); 4308 } 4309 bind(TRUE_LABEL); 4310 movl(result, 1); // return true 4311 jmpb(DONE); 4312 4313 bind(FALSE_LABEL); 4314 xorl(result, result); // return false 4315 4316 // That's it 4317 bind(DONE); 4318 if (UseAVX >= 2) { 4319 // clean upper bits of YMM registers 4320 vpxor(vec1, vec1); 4321 vpxor(vec2, vec2); 4322 } 4323 } 4324 4325 #ifdef _LP64 4326 4327 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4328 #define __ masm. 4329 Register dst = stub.data<0>(); 4330 XMMRegister src = stub.data<1>(); 4331 address target = stub.data<2>(); 4332 __ bind(stub.entry()); 4333 __ subptr(rsp, 8); 4334 __ movdbl(Address(rsp), src); 4335 __ call(RuntimeAddress(target)); 4336 __ pop(dst); 4337 __ jmp(stub.continuation()); 4338 #undef __ 4339 } 4340 4341 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4342 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4343 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4344 4345 address slowpath_target; 4346 if (dst_bt == T_INT) { 4347 if (src_bt == T_FLOAT) { 4348 cvttss2sil(dst, src); 4349 cmpl(dst, 0x80000000); 4350 slowpath_target = StubRoutines::x86::f2i_fixup(); 4351 } else { 4352 cvttsd2sil(dst, src); 4353 cmpl(dst, 0x80000000); 4354 slowpath_target = StubRoutines::x86::d2i_fixup(); 4355 } 4356 } else { 4357 if (src_bt == T_FLOAT) { 4358 cvttss2siq(dst, src); 4359 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4360 slowpath_target = StubRoutines::x86::f2l_fixup(); 4361 } else { 4362 cvttsd2siq(dst, src); 4363 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4364 slowpath_target = StubRoutines::x86::d2l_fixup(); 4365 } 4366 } 4367 4368 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4369 jcc(Assembler::equal, stub->entry()); 4370 bind(stub->continuation()); 4371 } 4372 4373 #endif // _LP64 4374 4375 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4376 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4377 switch(ideal_opc) { 4378 case Op_LShiftVS: 4379 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4380 case Op_LShiftVI: 4381 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4382 case Op_LShiftVL: 4383 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4384 case Op_RShiftVS: 4385 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4386 case Op_RShiftVI: 4387 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4388 case Op_RShiftVL: 4389 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4390 case Op_URShiftVS: 4391 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4392 case Op_URShiftVI: 4393 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4394 case Op_URShiftVL: 4395 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4396 case Op_RotateRightV: 4397 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4398 case Op_RotateLeftV: 4399 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4400 default: 4401 fatal("Unsupported masked operation"); break; 4402 } 4403 } 4404 4405 void 
C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4406 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4407 bool is_varshift) { 4408 switch (ideal_opc) { 4409 case Op_AddVB: 4410 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4411 case Op_AddVS: 4412 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4413 case Op_AddVI: 4414 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4415 case Op_AddVL: 4416 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4417 case Op_AddVF: 4418 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4419 case Op_AddVD: 4420 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4421 case Op_SubVB: 4422 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4423 case Op_SubVS: 4424 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4425 case Op_SubVI: 4426 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4427 case Op_SubVL: 4428 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4429 case Op_SubVF: 4430 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4431 case Op_SubVD: 4432 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4433 case Op_MulVS: 4434 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4435 case Op_MulVI: 4436 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4437 case Op_MulVL: 4438 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4439 case Op_MulVF: 4440 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4441 case Op_MulVD: 4442 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4443 case Op_DivVF: 4444 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4445 case Op_DivVD: 4446 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4447 case Op_SqrtVF: 4448 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4449 case Op_SqrtVD: 4450 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4451 case Op_AbsVB: 4452 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4453 case Op_AbsVS: 4454 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4455 case Op_AbsVI: 4456 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4457 case Op_AbsVL: 4458 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4459 case Op_FmaVF: 4460 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4461 case Op_FmaVD: 4462 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4463 case Op_VectorRearrange: 4464 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4465 case Op_LShiftVS: 4466 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4467 case Op_LShiftVI: 4468 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4469 case Op_LShiftVL: 4470 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4471 case Op_RShiftVS: 4472 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4473 case Op_RShiftVI: 4474 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4475 case Op_RShiftVL: 4476 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4477 case Op_URShiftVS: 4478 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4479 case Op_URShiftVI: 4480 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4481 case Op_URShiftVL: 4482 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4483 case Op_RotateLeftV: 4484 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4485 case Op_RotateRightV: 4486 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 
4487 case Op_MaxV: 4488 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4489 case Op_MinV: 4490 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4491 case Op_XorV: 4492 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4493 case Op_OrV: 4494 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4495 case Op_AndV: 4496 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4497 default: 4498 fatal("Unsupported masked operation"); break; 4499 } 4500 } 4501 4502 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4503 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4504 switch (ideal_opc) { 4505 case Op_AddVB: 4506 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4507 case Op_AddVS: 4508 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4509 case Op_AddVI: 4510 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4511 case Op_AddVL: 4512 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4513 case Op_AddVF: 4514 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4515 case Op_AddVD: 4516 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4517 case Op_SubVB: 4518 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4519 case Op_SubVS: 4520 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4521 case Op_SubVI: 4522 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4523 case Op_SubVL: 4524 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4525 case Op_SubVF: 4526 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4527 case Op_SubVD: 4528 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4529 case Op_MulVS: 4530 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4531 case Op_MulVI: 4532 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4533 case Op_MulVL: 4534 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4535 case Op_MulVF: 4536 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4537 case Op_MulVD: 4538 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4539 case Op_DivVF: 4540 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4541 case Op_DivVD: 4542 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4543 case Op_FmaVF: 4544 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4545 case Op_FmaVD: 4546 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4547 case Op_MaxV: 4548 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4549 case Op_MinV: 4550 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4551 case Op_XorV: 4552 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4553 case Op_OrV: 4554 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4555 case Op_AndV: 4556 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4557 default: 4558 fatal("Unsupported masked operation"); break; 4559 } 4560 } 4561 4562 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4563 KRegister src1, KRegister src2) { 4564 BasicType etype = T_ILLEGAL; 4565 switch(mask_len) { 4566 case 2: 4567 case 4: 4568 case 8: etype = T_BYTE; break; 4569 case 16: etype = T_SHORT; break; 4570 case 32: etype = T_INT; break; 4571 case 64: etype = T_LONG; break; 4572 default: fatal("Unsupported type"); break; 4573 } 4574 assert(etype != T_ILLEGAL, ""); 4575 switch(ideal_opc) { 4576 case Op_AndVMask: 4577 kand(etype, dst, src1, src2); break; 4578 case Op_OrVMask: 4579 kor(etype, dst, src1, src2); break; 4580 case Op_XorVMask: 
4581 kxor(etype, dst, src1, src2); break; 4582 default: 4583 fatal("Unsupported masked operation"); break; 4584 } 4585 } 4586 4587 /* 4588 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4589 * If src is NaN, the result is 0. 4590 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4591 * the result is equal to the value of Integer.MIN_VALUE. 4592 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4593 * the result is equal to the value of Integer.MAX_VALUE. 4594 */ 4595 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4596 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4597 Register rscratch, AddressLiteral float_sign_flip, 4598 int vec_enc) { 4599 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4600 Label done; 4601 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4602 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4603 vptest(xtmp2, xtmp2, vec_enc); 4604 jccb(Assembler::equal, done); 4605 4606 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4607 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4608 4609 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4610 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4611 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4612 4613 // Recompute the mask for remaining special value. 4614 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4615 // Extract SRC values corresponding to TRUE mask lanes. 4616 vpand(xtmp4, xtmp2, src, vec_enc); 4617 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4618 // values are set. 4619 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4620 4621 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4622 bind(done); 4623 } 4624 4625 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4626 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4627 Register rscratch, AddressLiteral float_sign_flip, 4628 int vec_enc) { 4629 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4630 Label done; 4631 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4632 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4633 kortestwl(ktmp1, ktmp1); 4634 jccb(Assembler::equal, done); 4635 4636 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4637 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4638 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4639 4640 kxorwl(ktmp1, ktmp1, ktmp2); 4641 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4642 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4643 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4644 bind(done); 4645 } 4646 4647 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4648 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4649 Register rscratch, AddressLiteral double_sign_flip, 4650 int vec_enc) { 4651 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4652 4653 Label done; 4654 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4655 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4656 kortestwl(ktmp1, ktmp1); 4657 jccb(Assembler::equal, done); 4658 4659 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4660 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4661 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4662 4663 kxorwl(ktmp1, ktmp1, ktmp2); 
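// ktmp1 now flags the remaining special lanes (sign-flip pattern, source not NaN); the next three
// instructions keep only the lanes whose source is non-negative and overwrite them with the flipped
// sign pattern (~double_sign_flip), i.e. Long.MAX_VALUE.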
4664 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4665 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4666 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4667 bind(done); 4668 } 4669 4670 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4671 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4672 Register rscratch, AddressLiteral float_sign_flip, 4673 int vec_enc) { 4674 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4675 Label done; 4676 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4677 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4678 kortestwl(ktmp1, ktmp1); 4679 jccb(Assembler::equal, done); 4680 4681 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4682 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4683 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4684 4685 kxorwl(ktmp1, ktmp1, ktmp2); 4686 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4687 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4688 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4689 bind(done); 4690 } 4691 4692 /* 4693 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4694 * If src is NaN, the result is 0. 4695 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4696 * the result is equal to the value of Long.MIN_VALUE. 4697 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 4698 * the result is equal to the value of Long.MAX_VALUE. 4699 */ 4700 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4701 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4702 Register rscratch, AddressLiteral double_sign_flip, 4703 int vec_enc) { 4704 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4705 4706 Label done; 4707 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4708 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 4709 kortestwl(ktmp1, ktmp1); 4710 jccb(Assembler::equal, done); 4711 4712 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4713 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4714 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4715 4716 kxorwl(ktmp1, ktmp1, ktmp2); 4717 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4718 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4719 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4720 bind(done); 4721 } 4722 4723 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 4724 XMMRegister xtmp, int index, int vec_enc) { 4725 assert(vec_enc < Assembler::AVX_512bit, ""); 4726 if (vec_enc == Assembler::AVX_256bit) { 4727 vextractf128_high(xtmp, src); 4728 vshufps(dst, src, xtmp, index, vec_enc); 4729 } else { 4730 vshufps(dst, src, zero, index, vec_enc); 4731 } 4732 } 4733 4734 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4735 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 4736 AddressLiteral float_sign_flip, int src_vec_enc) { 4737 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4738 4739 Label done; 4740 // Compare the destination lanes with float_sign_flip 4741 // value to get mask for all special values. 
4742 movdqu(xtmp1, float_sign_flip, rscratch); 4743 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 4744 ptest(xtmp2, xtmp2); 4745 jccb(Assembler::equal, done); 4746 4747 // Flip float_sign_flip to get max integer value. 4748 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 4749 pxor(xtmp1, xtmp4); 4750 4751 // Set destination lanes corresponding to unordered source lanes to zero. 4752 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 4753 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 4754 4755 // Shuffle mask vector and pack the lower double word from each quadword lane. 4756 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 4757 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 4758 4759 // Recompute the mask for the remaining special values. 4760 pxor(xtmp2, xtmp3); 4761 // Extract mask corresponding to non-negative source lanes. 4762 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 4763 4764 // Shuffle mask vector and pack the lower double word from each quadword lane. 4765 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 4766 pand(xtmp3, xtmp2); 4767 4768 // Replace destination lanes holding the special value (0x80000000) with max int 4769 // if the corresponding source lane holds a +ve value. 4770 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 4771 bind(done); 4772 } 4773 4774 4775 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 4776 XMMRegister xtmp, Register rscratch, int vec_enc) { 4777 switch(to_elem_bt) { 4778 case T_SHORT: 4779 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 4780 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 4781 vpackusdw(dst, dst, zero, vec_enc); 4782 if (vec_enc == Assembler::AVX_256bit) { 4783 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 4784 } 4785 break; 4786 case T_BYTE: 4787 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 4788 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 4789 vpackusdw(dst, dst, zero, vec_enc); 4790 if (vec_enc == Assembler::AVX_256bit) { 4791 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 4792 } 4793 vpackuswb(dst, dst, zero, vec_enc); 4794 break; 4795 default: assert(false, "%s", type2name(to_elem_bt)); 4796 } 4797 } 4798 4799 /* 4800 * Algorithm for vector D2L and F2I conversions:- 4801 * a) Perform vector D2L/F2I cast. 4802 * b) Choose the fast path if none of the result vector lanes contains the value 0x80000000; that value 4803 * signifies that the source could be one of the special floating point 4804 * values (NaN, -Inf, Inf, Max, -Min). 4805 * c) Set the destination to zero if the source is NaN. 4806 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
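 * For example, F2I follows Java's scalar (int) cast semantics: (int)Float.NaN == 0,
 * (int)Float.NEGATIVE_INFINITY == Integer.MIN_VALUE and (int)1.0e20f == Integer.MAX_VALUE;
 * D2L behaves the same way with Long.MIN_VALUE/Long.MAX_VALUE.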
4807 */ 4808 4809 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4810 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4811 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4812 int to_elem_sz = type2aelembytes(to_elem_bt); 4813 assert(to_elem_sz <= 4, ""); 4814 vcvttps2dq(dst, src, vec_enc); 4815 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 4816 if (to_elem_sz < 4) { 4817 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4818 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 4819 } 4820 } 4821 4822 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4823 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 4824 Register rscratch, int vec_enc) { 4825 int to_elem_sz = type2aelembytes(to_elem_bt); 4826 assert(to_elem_sz <= 4, ""); 4827 vcvttps2dq(dst, src, vec_enc); 4828 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 4829 switch(to_elem_bt) { 4830 case T_INT: 4831 break; 4832 case T_SHORT: 4833 evpmovdw(dst, dst, vec_enc); 4834 break; 4835 case T_BYTE: 4836 evpmovdb(dst, dst, vec_enc); 4837 break; 4838 default: assert(false, "%s", type2name(to_elem_bt)); 4839 } 4840 } 4841 4842 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4843 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 4844 Register rscratch, int vec_enc) { 4845 evcvttps2qq(dst, src, vec_enc); 4846 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 4847 } 4848 4849 // Handling for downcasting from double to integer or sub-word types on AVX2. 4850 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4851 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 4852 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4853 int to_elem_sz = type2aelembytes(to_elem_bt); 4854 assert(to_elem_sz < 8, ""); 4855 vcvttpd2dq(dst, src, vec_enc); 4856 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 4857 float_sign_flip, vec_enc); 4858 if (to_elem_sz < 4) { 4859 // xtmp4 holds all zero lanes. 
4860 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 4861 } 4862 } 4863 4864 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 4865 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 4866 KRegister ktmp2, AddressLiteral sign_flip, 4867 Register rscratch, int vec_enc) { 4868 if (VM_Version::supports_avx512dq()) { 4869 evcvttpd2qq(dst, src, vec_enc); 4870 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 4871 switch(to_elem_bt) { 4872 case T_LONG: 4873 break; 4874 case T_INT: 4875 evpmovsqd(dst, dst, vec_enc); 4876 break; 4877 case T_SHORT: 4878 evpmovsqd(dst, dst, vec_enc); 4879 evpmovdw(dst, dst, vec_enc); 4880 break; 4881 case T_BYTE: 4882 evpmovsqd(dst, dst, vec_enc); 4883 evpmovdb(dst, dst, vec_enc); 4884 break; 4885 default: assert(false, "%s", type2name(to_elem_bt)); 4886 } 4887 } else { 4888 assert(type2aelembytes(to_elem_bt) <= 4, ""); 4889 vcvttpd2dq(dst, src, vec_enc); 4890 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 4891 switch(to_elem_bt) { 4892 case T_INT: 4893 break; 4894 case T_SHORT: 4895 evpmovdw(dst, dst, vec_enc); 4896 break; 4897 case T_BYTE: 4898 evpmovdb(dst, dst, vec_enc); 4899 break; 4900 default: assert(false, "%s", type2name(to_elem_bt)); 4901 } 4902 } 4903 } 4904 4905 #ifdef _LP64 4906 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 4907 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4908 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 4909 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4910 // and re-instantiate original MXCSR.RC mode after that. 4911 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4912 4913 mov64(tmp, julong_cast(0.5L)); 4914 evpbroadcastq(xtmp1, tmp, vec_enc); 4915 vaddpd(xtmp1, src , xtmp1, vec_enc); 4916 evcvtpd2qq(dst, xtmp1, vec_enc); 4917 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 4918 double_sign_flip, vec_enc);; 4919 4920 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4921 } 4922 4923 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 4924 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4925 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 4926 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4927 // and re-instantiate original MXCSR.RC mode after that. 
4928 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4929 4930 movl(tmp, jint_cast(0.5)); 4931 movq(xtmp1, tmp); 4932 vbroadcastss(xtmp1, xtmp1, vec_enc); 4933 vaddps(xtmp1, src , xtmp1, vec_enc); 4934 vcvtps2dq(dst, xtmp1, vec_enc); 4935 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 4936 float_sign_flip, vec_enc); 4937 4938 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4939 } 4940 4941 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 4942 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4943 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 4944 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4945 // and re-instantiate original MXCSR.RC mode after that. 4946 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4947 4948 movl(tmp, jint_cast(0.5)); 4949 movq(xtmp1, tmp); 4950 vbroadcastss(xtmp1, xtmp1, vec_enc); 4951 vaddps(xtmp1, src , xtmp1, vec_enc); 4952 vcvtps2dq(dst, xtmp1, vec_enc); 4953 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 4954 4955 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4956 } 4957 #endif // _LP64 4958 4959 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 4960 BasicType from_elem_bt, BasicType to_elem_bt) { 4961 switch (from_elem_bt) { 4962 case T_BYTE: 4963 switch (to_elem_bt) { 4964 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 4965 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 4966 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 4967 default: ShouldNotReachHere(); 4968 } 4969 break; 4970 case T_SHORT: 4971 switch (to_elem_bt) { 4972 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 4973 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 4974 default: ShouldNotReachHere(); 4975 } 4976 break; 4977 case T_INT: 4978 assert(to_elem_bt == T_LONG, ""); 4979 vpmovzxdq(dst, src, vlen_enc); 4980 break; 4981 default: 4982 ShouldNotReachHere(); 4983 } 4984 } 4985 4986 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 4987 BasicType from_elem_bt, BasicType to_elem_bt) { 4988 switch (from_elem_bt) { 4989 case T_BYTE: 4990 switch (to_elem_bt) { 4991 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 4992 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 4993 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 4994 default: ShouldNotReachHere(); 4995 } 4996 break; 4997 case T_SHORT: 4998 switch (to_elem_bt) { 4999 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5000 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5001 default: ShouldNotReachHere(); 5002 } 5003 break; 5004 case T_INT: 5005 assert(to_elem_bt == T_LONG, ""); 5006 vpmovsxdq(dst, src, vlen_enc); 5007 break; 5008 default: 5009 ShouldNotReachHere(); 5010 } 5011 } 5012 5013 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5014 BasicType dst_bt, BasicType src_bt, int vlen) { 5015 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5016 assert(vlen_enc != AVX_512bit, ""); 5017 5018 int dst_bt_size = type2aelembytes(dst_bt); 5019 int src_bt_size = type2aelembytes(src_bt); 5020 if (dst_bt_size > src_bt_size) { 5021 switch (dst_bt_size / src_bt_size) { 5022 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5023 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5024 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5025 default: ShouldNotReachHere(); 5026 } 5027 } else { 5028 assert(dst_bt_size < src_bt_size, ""); 5029 switch (src_bt_size / dst_bt_size) { 5030 case 2: { 5031 if (vlen_enc == AVX_128bit) { 5032 vpacksswb(dst, src, src, vlen_enc); 5033 } else { 5034 vpacksswb(dst, src, src, vlen_enc); 5035 vpermq(dst, dst, 0x08, vlen_enc); 5036 } 5037 break; 5038 } 5039 case 4: { 5040 if (vlen_enc == AVX_128bit) { 5041 vpackssdw(dst, src, src, vlen_enc); 5042 vpacksswb(dst, dst, dst, vlen_enc); 5043 } else { 5044 vpackssdw(dst, src, src, vlen_enc); 5045 vpermq(dst, dst, 0x08, vlen_enc); 5046 vpacksswb(dst, dst, dst, AVX_128bit); 5047 } 5048 break; 5049 } 5050 case 8: { 5051 if (vlen_enc == AVX_128bit) { 5052 vpshufd(dst, src, 0x08, vlen_enc); 5053 vpackssdw(dst, dst, dst, vlen_enc); 5054 vpacksswb(dst, dst, dst, vlen_enc); 5055 } else { 5056 vpshufd(dst, src, 0x08, vlen_enc); 5057 vpermq(dst, dst, 0x08, vlen_enc); 5058 vpackssdw(dst, dst, dst, AVX_128bit); 5059 vpacksswb(dst, dst, dst, AVX_128bit); 5060 } 5061 break; 5062 } 5063 default: ShouldNotReachHere(); 5064 } 5065 } 5066 } 5067 5068 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5069 bool merge, BasicType bt, int vlen_enc) { 5070 if (bt == T_INT) { 5071 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5072 } else { 5073 assert(bt == T_LONG, ""); 5074 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5075 } 5076 } 5077 5078 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5079 bool merge, BasicType bt, int vlen_enc) { 5080 if (bt == T_INT) { 5081 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5082 } else { 5083 assert(bt == T_LONG, ""); 5084 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5085 } 5086 } 5087 5088 #ifdef _LP64 5089 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5090 Register rtmp2, XMMRegister xtmp, int mask_len, 5091 int vec_enc) { 5092 int index = 0; 5093 int vindex = 0; 5094 mov64(rtmp1, 0x0101010101010101L); 5095 pdepq(rtmp1, src, rtmp1); 5096 if (mask_len > 8) { 5097 movq(rtmp2, src); 5098 vpxor(xtmp, xtmp, xtmp, vec_enc); 5099 movq(xtmp, rtmp1); 5100 } 5101 movq(dst, rtmp1); 5102 5103 mask_len -= 8; 5104 while (mask_len > 0) { 5105 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5106 index++; 5107 if ((index % 2) == 0) { 5108 pxor(xtmp, xtmp); 5109 } 5110 mov64(rtmp1, 0x0101010101010101L); 5111 shrq(rtmp2, 8); 5112 pdepq(rtmp1, rtmp2, rtmp1); 5113 pinsrq(xtmp, rtmp1, index % 2); 5114 vindex = index / 2; 5115 if (vindex) { 5116 // Write entire 16 byte vector when both 64 bit 5117 // lanes are update to save redundant instructions. 
5118 if (index % 2) { 5119 vinsertf128(dst, dst, xtmp, vindex); 5120 } 5121 } else { 5122 vmovdqu(dst, xtmp); 5123 } 5124 mask_len -= 8; 5125 } 5126 } 5127 5128 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5129 switch(opc) { 5130 case Op_VectorMaskTrueCount: 5131 popcntq(dst, tmp); 5132 break; 5133 case Op_VectorMaskLastTrue: 5134 if (VM_Version::supports_lzcnt()) { 5135 lzcntq(tmp, tmp); 5136 movl(dst, 63); 5137 subl(dst, tmp); 5138 } else { 5139 movl(dst, -1); 5140 bsrq(tmp, tmp); 5141 cmov32(Assembler::notZero, dst, tmp); 5142 } 5143 break; 5144 case Op_VectorMaskFirstTrue: 5145 if (VM_Version::supports_bmi1()) { 5146 if (masklen < 32) { 5147 orl(tmp, 1 << masklen); 5148 tzcntl(dst, tmp); 5149 } else if (masklen == 32) { 5150 tzcntl(dst, tmp); 5151 } else { 5152 assert(masklen == 64, ""); 5153 tzcntq(dst, tmp); 5154 } 5155 } else { 5156 if (masklen < 32) { 5157 orl(tmp, 1 << masklen); 5158 bsfl(dst, tmp); 5159 } else { 5160 assert(masklen == 32 || masklen == 64, ""); 5161 movl(dst, masklen); 5162 if (masklen == 32) { 5163 bsfl(tmp, tmp); 5164 } else { 5165 bsfq(tmp, tmp); 5166 } 5167 cmov32(Assembler::notZero, dst, tmp); 5168 } 5169 } 5170 break; 5171 case Op_VectorMaskToLong: 5172 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5173 break; 5174 default: assert(false, "Unhandled mask operation"); 5175 } 5176 } 5177 5178 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5179 int masklen, int masksize, int vec_enc) { 5180 assert(VM_Version::supports_popcnt(), ""); 5181 5182 if(VM_Version::supports_avx512bw()) { 5183 kmovql(tmp, mask); 5184 } else { 5185 assert(masklen <= 16, ""); 5186 kmovwl(tmp, mask); 5187 } 5188 5189 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5190 // operations needs to be clipped. 5191 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5192 andq(tmp, (1 << masklen) - 1); 5193 } 5194 5195 vector_mask_operation_helper(opc, dst, tmp, masklen); 5196 } 5197 5198 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5199 Register tmp, int masklen, BasicType bt, int vec_enc) { 5200 assert(vec_enc == AVX_128bit && VM_Version::supports_avx() || 5201 vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), ""); 5202 assert(VM_Version::supports_popcnt(), ""); 5203 5204 bool need_clip = false; 5205 switch(bt) { 5206 case T_BOOLEAN: 5207 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5208 vpxor(xtmp, xtmp, xtmp, vec_enc); 5209 vpsubb(xtmp, xtmp, mask, vec_enc); 5210 vpmovmskb(tmp, xtmp, vec_enc); 5211 need_clip = masklen < 16; 5212 break; 5213 case T_BYTE: 5214 vpmovmskb(tmp, mask, vec_enc); 5215 need_clip = masklen < 16; 5216 break; 5217 case T_SHORT: 5218 vpacksswb(xtmp, mask, mask, vec_enc); 5219 if (masklen >= 16) { 5220 vpermpd(xtmp, xtmp, 8, vec_enc); 5221 } 5222 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5223 need_clip = masklen < 16; 5224 break; 5225 case T_INT: 5226 case T_FLOAT: 5227 vmovmskps(tmp, mask, vec_enc); 5228 need_clip = masklen < 4; 5229 break; 5230 case T_LONG: 5231 case T_DOUBLE: 5232 vmovmskpd(tmp, mask, vec_enc); 5233 need_clip = masklen < 2; 5234 break; 5235 default: assert(false, "Unhandled type, %s", type2name(bt)); 5236 } 5237 5238 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5239 // operations needs to be clipped. 
5240 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5241 // need_clip implies masklen < 32 5242 andq(tmp, (1 << masklen) - 1); 5243 } 5244 5245 vector_mask_operation_helper(opc, dst, tmp, masklen); 5246 } 5247 5248 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5249 Register rtmp2, int mask_len) { 5250 kmov(rtmp1, src); 5251 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5252 mov64(rtmp2, -1L); 5253 pextq(rtmp2, rtmp2, rtmp1); 5254 kmov(dst, rtmp2); 5255 } 5256 5257 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5258 bool merge, BasicType bt, int vec_enc) { 5259 if (opcode == Op_CompressV) { 5260 switch(bt) { 5261 case T_BYTE: 5262 evpcompressb(dst, mask, src, merge, vec_enc); 5263 break; 5264 case T_CHAR: 5265 case T_SHORT: 5266 evpcompressw(dst, mask, src, merge, vec_enc); 5267 break; 5268 case T_INT: 5269 evpcompressd(dst, mask, src, merge, vec_enc); 5270 break; 5271 case T_FLOAT: 5272 evcompressps(dst, mask, src, merge, vec_enc); 5273 break; 5274 case T_LONG: 5275 evpcompressq(dst, mask, src, merge, vec_enc); 5276 break; 5277 case T_DOUBLE: 5278 evcompresspd(dst, mask, src, merge, vec_enc); 5279 break; 5280 default: 5281 fatal("Unsupported type %s", type2name(bt)); 5282 break; 5283 } 5284 } else { 5285 assert(opcode == Op_ExpandV, ""); 5286 switch(bt) { 5287 case T_BYTE: 5288 evpexpandb(dst, mask, src, merge, vec_enc); 5289 break; 5290 case T_CHAR: 5291 case T_SHORT: 5292 evpexpandw(dst, mask, src, merge, vec_enc); 5293 break; 5294 case T_INT: 5295 evpexpandd(dst, mask, src, merge, vec_enc); 5296 break; 5297 case T_FLOAT: 5298 evexpandps(dst, mask, src, merge, vec_enc); 5299 break; 5300 case T_LONG: 5301 evpexpandq(dst, mask, src, merge, vec_enc); 5302 break; 5303 case T_DOUBLE: 5304 evexpandpd(dst, mask, src, merge, vec_enc); 5305 break; 5306 default: 5307 fatal("Unsupported type %s", type2name(bt)); 5308 break; 5309 } 5310 } 5311 } 5312 #endif 5313 5314 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5315 KRegister ktmp1, int vec_enc) { 5316 if (opcode == Op_SignumVD) { 5317 vsubpd(dst, zero, one, vec_enc); 5318 // if src < 0 ? -1 : 1 5319 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5320 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5321 // if src == NaN, -0.0 or 0.0 return src. 5322 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5323 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5324 } else { 5325 assert(opcode == Op_SignumVF, ""); 5326 vsubps(dst, zero, one, vec_enc); 5327 // if src < 0 ? -1 : 1 5328 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5329 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5330 // if src == NaN, -0.0 or 0.0 return src. 5331 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5332 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5333 } 5334 } 5335 5336 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5337 XMMRegister xtmp1, int vec_enc) { 5338 if (opcode == Op_SignumVD) { 5339 vsubpd(dst, zero, one, vec_enc); 5340 // if src < 0 ? -1 : 1 5341 vblendvpd(dst, one, dst, src, vec_enc); 5342 // if src == NaN, -0.0 or 0.0 return src. 5343 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5344 vblendvpd(dst, dst, src, xtmp1, vec_enc); 5345 } else { 5346 assert(opcode == Op_SignumVF, ""); 5347 vsubps(dst, zero, one, vec_enc); 5348 // if src < 0 ? 
-1 : 1 5349 vblendvps(dst, one, dst, src, vec_enc); 5350 // if src == NaN, -0.0 or 0.0 return src. 5351 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5352 vblendvps(dst, dst, src, xtmp1, vec_enc); 5353 } 5354 } 5355 5356 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5357 if (VM_Version::supports_avx512bw()) { 5358 if (mask_len > 32) { 5359 kmovql(dst, src); 5360 } else { 5361 kmovdl(dst, src); 5362 if (mask_len != 32) { 5363 kshiftrdl(dst, dst, 32 - mask_len); 5364 } 5365 } 5366 } else { 5367 assert(mask_len <= 16, ""); 5368 kmovwl(dst, src); 5369 if (mask_len != 16) { 5370 kshiftrwl(dst, dst, 16 - mask_len); 5371 } 5372 } 5373 } 5374 5375 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5376 int lane_size = type2aelembytes(bt); 5377 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); 5378 if ((is_LP64 || lane_size < 8) && 5379 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5380 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) { 5381 movptr(rtmp, imm32); 5382 switch(lane_size) { 5383 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5384 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5385 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5386 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5387 default : fatal("Unsupported lane size %d", lane_size); 5388 break; 5389 } 5390 } else { 5391 movptr(rtmp, imm32); 5392 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp)); 5393 switch(lane_size) { 5394 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5395 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5396 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5397 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5398 default : fatal("Unsupported lane size %d", lane_size); 5399 break; 5400 } 5401 } 5402 } 5403 5404 // 5405 // Following is lookup table based popcount computation algorithm:- 5406 // Index Bit set count 5407 // [ 0000 -> 0, 5408 // 0001 -> 1, 5409 // 0010 -> 1, 5410 // 0011 -> 2, 5411 // 0100 -> 1, 5412 // 0101 -> 2, 5413 // 0110 -> 2, 5414 // 0111 -> 3, 5415 // 1000 -> 1, 5416 // 1001 -> 2, 5417 // 1010 -> 2, 5418 // 1011 -> 3, 5419 // 1100 -> 2, 5420 // 1101 -> 3, 5421 // 1110 -> 3, 1111 -> 4 ] 5422 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5423 // shuffle indices for lookup table access. 5424 // b. Right shift each byte of vector lane by 4 positions. 5425 // c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as 5426 // shuffle indices for lookup table access. 5427 // d. Add the bitset count of upper and lower 4 bits of each byte. 5428 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5429 // count of all the bytes of a quadword. 5430 // f. Perform step e. for upper 128bit vector lane. 5431 // g. Pack the bitset count of quadwords back to double word. 5432 // h. Unpacking and packing operations are not needed for 64bit vector lane.
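// As a scalar sketch of steps a-d (illustrative only, not part of the generated code):
//   static const uint8_t lut[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
//   uint8_t popcount_byte(uint8_t b) { return lut[b & 0x0F] + lut[b >> 4]; }
// vector_popcount_byte() below performs the same pair of lookups per byte with vpshufb, while the
// int/short/long variants add the widening described in steps e-h.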
5433 5434 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5435 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5436 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5437 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5438 vpsrlw(dst, src, 4, vec_enc); 5439 vpand(dst, dst, xtmp1, vec_enc); 5440 vpand(xtmp1, src, xtmp1, vec_enc); 5441 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5442 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5443 vpshufb(dst, xtmp2, dst, vec_enc); 5444 vpaddb(dst, dst, xtmp1, vec_enc); 5445 } 5446 5447 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5448 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5449 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5450 // Following code is as per steps e,f,g and h of above algorithm. 5451 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5452 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5453 vpsadbw(dst, dst, xtmp2, vec_enc); 5454 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5455 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5456 vpackuswb(dst, xtmp1, dst, vec_enc); 5457 } 5458 5459 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5460 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5461 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5462 // Add the popcount of upper and lower bytes of word. 5463 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5464 vpsrlw(dst, xtmp1, 8, vec_enc); 5465 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5466 vpaddw(dst, dst, xtmp1, vec_enc); 5467 } 5468 5469 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5470 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5471 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5472 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5473 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5474 } 5475 5476 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5477 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5478 switch(bt) { 5479 case T_LONG: 5480 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5481 break; 5482 case T_INT: 5483 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5484 break; 5485 case T_CHAR: 5486 case T_SHORT: 5487 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5488 break; 5489 case T_BYTE: 5490 case T_BOOLEAN: 5491 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5492 break; 5493 default: 5494 fatal("Unsupported type %s", type2name(bt)); 5495 break; 5496 } 5497 } 5498 5499 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5500 KRegister mask, bool merge, int vec_enc) { 5501 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5502 switch(bt) { 5503 case T_LONG: 5504 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5505 evpopcntq(dst, mask, src, merge, vec_enc); 5506 break; 5507 case T_INT: 5508 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5509 evpopcntd(dst, mask, src, merge, vec_enc); 5510 break; 5511 case T_CHAR: 5512 case T_SHORT: 5513 assert(VM_Version::supports_avx512_bitalg(), ""); 5514 evpopcntw(dst, mask, src, merge, vec_enc); 5515 break; 5516 case T_BYTE: 5517 case T_BOOLEAN: 5518 assert(VM_Version::supports_avx512_bitalg(), ""); 5519 evpopcntb(dst, mask, 
src, merge, vec_enc); 5520 break; 5521 default: 5522 fatal("Unsupported type %s", type2name(bt)); 5523 break; 5524 } 5525 } 5526 5527 #ifndef _LP64 5528 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5529 assert(VM_Version::supports_avx512bw(), ""); 5530 kmovdl(tmp, src); 5531 kunpckdql(dst, tmp, tmp); 5532 } 5533 #endif 5534 5535 // Bit reversal algorithm first reverses the bits of each byte followed by 5536 // a byte level reversal for multi-byte primitive types (short/int/long). 5537 // Algorithm performs a lookup table access to get reverse bit sequence 5538 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5539 // is obtained by swapping the reverse bit sequences of upper and lower 5540 // nibble of a byte. 5541 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5542 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5543 if (VM_Version::supports_avx512vlbw()) { 5544 5545 // Get the reverse bit sequence of lower nibble of each byte. 5546 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5547 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5548 evpandq(dst, xtmp2, src, vec_enc); 5549 vpshufb(dst, xtmp1, dst, vec_enc); 5550 vpsllq(dst, dst, 4, vec_enc); 5551 5552 // Get the reverse bit sequence of upper nibble of each byte. 5553 vpandn(xtmp2, xtmp2, src, vec_enc); 5554 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5555 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5556 5557 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5558 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5559 evporq(xtmp2, dst, xtmp2, vec_enc); 5560 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5561 5562 } else if(vec_enc == Assembler::AVX_512bit) { 5563 // Shift based bit reversal. 5564 assert(bt == T_LONG || bt == T_INT, ""); 5565 5566 // Swap lower and upper nibble of each byte. 5567 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5568 5569 // Swap two least and most significant bits of each nibble. 5570 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5571 5572 // Swap adjacent pair of bits. 5573 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5574 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5575 5576 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5577 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5578 } else { 5579 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5580 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5581 5582 // Get the reverse bit sequence of lower nibble of each byte. 5583 vpand(dst, xtmp2, src, vec_enc); 5584 vpshufb(dst, xtmp1, dst, vec_enc); 5585 vpsllq(dst, dst, 4, vec_enc); 5586 5587 // Get the reverse bit sequence of upper nibble of each byte. 5588 vpandn(xtmp2, xtmp2, src, vec_enc); 5589 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5590 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5591 5592 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5593 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
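// e.g. for the source byte 0b11010010: the lower nibble 0010 reverses to 0100 and is shifted into the
// upper half, the upper nibble 1101 reverses to 1011 and lands in the lower half, and the OR yields
// 0b01001011, the fully bit-reversed byte.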
5594 vpor(xtmp2, dst, xtmp2, vec_enc); 5595 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5596 } 5597 } 5598 5599 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5600 XMMRegister xtmp, Register rscratch) { 5601 assert(VM_Version::supports_gfni(), ""); 5602 assert(rscratch != noreg || always_reachable(mask), "missing"); 5603 5604 // Galois field instruction based bit reversal based on following algorithm. 5605 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5606 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5607 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5608 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5609 } 5610 5611 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5612 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5613 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5614 evpandq(dst, xtmp1, src, vec_enc); 5615 vpsllq(dst, dst, nbits, vec_enc); 5616 vpandn(xtmp1, xtmp1, src, vec_enc); 5617 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5618 evporq(dst, dst, xtmp1, vec_enc); 5619 } 5620 5621 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5622 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5623 // Shift based bit reversal. 5624 assert(VM_Version::supports_evex(), ""); 5625 switch(bt) { 5626 case T_LONG: 5627 // Swap upper and lower double word of each quad word. 5628 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5629 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5630 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5631 break; 5632 case T_INT: 5633 // Swap upper and lower word of each double word. 5634 evprord(xtmp1, k0, src, 16, true, vec_enc); 5635 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5636 break; 5637 case T_CHAR: 5638 case T_SHORT: 5639 // Swap upper and lower byte of each word. 5640 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5641 break; 5642 case T_BYTE: 5643 evmovdquq(dst, k0, src, true, vec_enc); 5644 break; 5645 default: 5646 fatal("Unsupported type %s", type2name(bt)); 5647 break; 5648 } 5649 } 5650 5651 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5652 if (bt == T_BYTE) { 5653 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5654 evmovdquq(dst, k0, src, true, vec_enc); 5655 } else { 5656 vmovdqu(dst, src); 5657 } 5658 return; 5659 } 5660 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5661 // pre-computed shuffle indices. 
5662 switch(bt) { 5663 case T_LONG: 5664 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5665 break; 5666 case T_INT: 5667 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5668 break; 5669 case T_CHAR: 5670 case T_SHORT: 5671 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 5672 break; 5673 default: 5674 fatal("Unsupported type %s", type2name(bt)); 5675 break; 5676 } 5677 vpshufb(dst, src, dst, vec_enc); 5678 } 5679 5680 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5681 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5682 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 5683 assert(is_integral_type(bt), ""); 5684 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5685 assert(VM_Version::supports_avx512cd(), ""); 5686 switch(bt) { 5687 case T_LONG: 5688 evplzcntq(dst, ktmp, src, merge, vec_enc); 5689 break; 5690 case T_INT: 5691 evplzcntd(dst, ktmp, src, merge, vec_enc); 5692 break; 5693 case T_SHORT: 5694 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 5695 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 5696 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 5697 vpunpckhwd(dst, xtmp1, src, vec_enc); 5698 evplzcntd(dst, ktmp, dst, merge, vec_enc); 5699 vpackusdw(dst, xtmp2, dst, vec_enc); 5700 break; 5701 case T_BYTE: 5702 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5703 // accessing the lookup table. 5704 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5705 // accessing the lookup table. 5706 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 5707 assert(VM_Version::supports_avx512bw(), ""); 5708 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 5709 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 5710 vpand(xtmp2, dst, src, vec_enc); 5711 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5712 vpsrlw(xtmp3, src, 4, vec_enc); 5713 vpand(xtmp3, dst, xtmp3, vec_enc); 5714 vpshufb(dst, xtmp1, xtmp3, vec_enc); 5715 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5716 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 5717 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 5718 break; 5719 default: 5720 fatal("Unsupported type %s", type2name(bt)); 5721 break; 5722 } 5723 } 5724 5725 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5726 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5727 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 5728 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5729 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5730 // accessing the lookup table. 5731 vpand(dst, xtmp2, src, vec_enc); 5732 vpshufb(dst, xtmp1, dst, vec_enc); 5733 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5734 // accessing the lookup table. 5735 vpsrlw(xtmp3, src, 4, vec_enc); 5736 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 5737 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 5738 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
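// (When the upper nibble is zero its lookup value is 4 and the leading zeros continue into the lower
// nibble, so the two counts are summed; otherwise the upper-nibble count alone is the answer.)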
5739 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5740 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 5741 vpaddb(dst, dst, xtmp2, vec_enc); 5742 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 5743 } 5744 5745 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5746 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5747 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5748 // Add zero counts of lower byte and upper byte of a word if 5749 // upper byte holds a zero value. 5750 vpsrlw(xtmp3, src, 8, vec_enc); 5751 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5752 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 5753 vpsllw(xtmp2, dst, 8, vec_enc); 5754 vpaddw(xtmp2, xtmp2, dst, vec_enc); 5755 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5756 vpsrlw(dst, dst, 8, vec_enc); 5757 } 5758 5759 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5760 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 5761 // Since IEEE 754 floating point format represents mantissa in 1.0 format 5762 // hence biased exponent can be used to compute leading zero count as per 5763 // following formula:- 5764 // LZCNT = 32 - (biased_exp - 127) 5765 // Special handling has been introduced for Zero, Max_Int and -ve source values. 5766 5767 // Broadcast 0xFF 5768 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 5769 vpsrld(xtmp1, xtmp1, 24, vec_enc); 5770 5771 // Extract biased exponent. 5772 vcvtdq2ps(dst, src, vec_enc); 5773 vpsrld(dst, dst, 23, vec_enc); 5774 vpand(dst, dst, xtmp1, vec_enc); 5775 5776 // Broadcast 127. 5777 vpsrld(xtmp1, xtmp1, 1, vec_enc); 5778 // Exponent = biased_exp - 127 5779 vpsubd(dst, dst, xtmp1, vec_enc); 5780 5781 // Exponent = Exponent + 1 5782 vpsrld(xtmp3, xtmp1, 6, vec_enc); 5783 vpaddd(dst, dst, xtmp3, vec_enc); 5784 5785 // Replace -ve exponent with zero, exponent is -ve when src 5786 // lane contains a zero value. 5787 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5788 vblendvps(dst, dst, xtmp2, dst, vec_enc); 5789 5790 // Rematerialize broadcast 32. 5791 vpslld(xtmp1, xtmp3, 5, vec_enc); 5792 // Exponent is 32 if corresponding source lane contains max_int value. 5793 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5794 // LZCNT = 32 - exponent 5795 vpsubd(dst, xtmp1, dst, vec_enc); 5796 5797 // Replace LZCNT with a value 1 if corresponding source lane 5798 // contains max_int value. 5799 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 5800 5801 // Replace biased_exp with 0 if source lane value is less than zero. 5802 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5803 vblendvps(dst, dst, xtmp2, src, vec_enc); 5804 } 5805 5806 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5807 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5808 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5809 // Add zero counts of lower word and upper word of a double word if 5810 // upper word holds a zero value. 5811 vpsrld(xtmp3, src, 16, vec_enc); 5812 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5813 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 5814 vpslld(xtmp2, dst, 16, vec_enc); 5815 vpaddd(xtmp2, xtmp2, dst, vec_enc); 5816 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5817 vpsrld(dst, dst, 16, vec_enc); 5818 // Add zero counts of lower doubleword and upper doubleword of a 5819 // quadword if upper doubleword holds a zero value. 
5820 vpsrlq(xtmp3, src, 32, vec_enc); 5821 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 5822 vpsllq(xtmp2, dst, 32, vec_enc); 5823 vpaddq(xtmp2, xtmp2, dst, vec_enc); 5824 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5825 vpsrlq(dst, dst, 32, vec_enc); 5826 } 5827 5828 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 5829 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5830 Register rtmp, int vec_enc) { 5831 assert(is_integral_type(bt), "unexpected type"); 5832 assert(vec_enc < Assembler::AVX_512bit, ""); 5833 switch(bt) { 5834 case T_LONG: 5835 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5836 break; 5837 case T_INT: 5838 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 5839 break; 5840 case T_SHORT: 5841 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5842 break; 5843 case T_BYTE: 5844 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5845 break; 5846 default: 5847 fatal("Unsupported type %s", type2name(bt)); 5848 break; 5849 } 5850 } 5851 5852 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 5853 switch(bt) { 5854 case T_BYTE: 5855 vpsubb(dst, src1, src2, vec_enc); 5856 break; 5857 case T_SHORT: 5858 vpsubw(dst, src1, src2, vec_enc); 5859 break; 5860 case T_INT: 5861 vpsubd(dst, src1, src2, vec_enc); 5862 break; 5863 case T_LONG: 5864 vpsubq(dst, src1, src2, vec_enc); 5865 break; 5866 default: 5867 fatal("Unsupported type %s", type2name(bt)); 5868 break; 5869 } 5870 } 5871 5872 // Trailing zero count computation is based on leading zero count operation as per 5873 // following equation. All AVX3 targets support AVX512CD feature which offers 5874 // direct vector instruction to compute leading zero count. 
5875 // CTZ = PRIM_TYPE_WIDHT - CLZ((x - 1) & ~x) 5876 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5877 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5878 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) { 5879 assert(is_integral_type(bt), ""); 5880 // xtmp = -1 5881 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc); 5882 // xtmp = xtmp + src 5883 vpadd(bt, xtmp4, xtmp4, src, vec_enc); 5884 // xtmp = xtmp & ~src 5885 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc); 5886 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc); 5887 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc); 5888 vpsub(bt, dst, xtmp4, dst, vec_enc); 5889 } 5890 5891 // Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation 5892 // CTZ = PRIM_TYPE_WIDHT - POPC(x | -x) 5893 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5894 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5895 assert(is_integral_type(bt), ""); 5896 // xtmp = 0 5897 vpxor(xtmp3 , xtmp3, xtmp3, vec_enc); 5898 // xtmp = 0 - src 5899 vpsub(bt, xtmp3, xtmp3, src, vec_enc); 5900 // xtmp = xtmp | src 5901 vpor(xtmp3, xtmp3, src, vec_enc); 5902 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc); 5903 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc); 5904 vpsub(bt, dst, xtmp1, dst, vec_enc); 5905 } 5906 5907 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) { 5908 Label done; 5909 Label neg_divisor_fastpath; 5910 cmpl(divisor, 0); 5911 jccb(Assembler::less, neg_divisor_fastpath); 5912 xorl(rdx, rdx); 5913 divl(divisor); 5914 jmpb(done); 5915 bind(neg_divisor_fastpath); 5916 // Fastpath for divisor < 0: 5917 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 5918 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 5919 movl(rdx, rax); 5920 subl(rdx, divisor); 5921 if (VM_Version::supports_bmi1()) { 5922 andnl(rax, rdx, rax); 5923 } else { 5924 notl(rdx); 5925 andl(rax, rdx); 5926 } 5927 shrl(rax, 31); 5928 bind(done); 5929 } 5930 5931 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) { 5932 Label done; 5933 Label neg_divisor_fastpath; 5934 cmpl(divisor, 0); 5935 jccb(Assembler::less, neg_divisor_fastpath); 5936 xorl(rdx, rdx); 5937 divl(divisor); 5938 jmpb(done); 5939 bind(neg_divisor_fastpath); 5940 // Fastpath when divisor < 0: 5941 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 5942 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 5943 movl(rdx, rax); 5944 subl(rax, divisor); 5945 if (VM_Version::supports_bmi1()) { 5946 andnl(rax, rax, rdx); 5947 } else { 5948 notl(rax); 5949 andl(rax, rdx); 5950 } 5951 sarl(rax, 31); 5952 andl(rax, divisor); 5953 subl(rdx, rax); 5954 bind(done); 5955 } 5956 5957 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) { 5958 Label done; 5959 Label neg_divisor_fastpath; 5960 5961 cmpl(divisor, 0); 5962 jccb(Assembler::less, neg_divisor_fastpath); 5963 xorl(rdx, rdx); 5964 divl(divisor); 5965 jmpb(done); 5966 bind(neg_divisor_fastpath); 5967 // Fastpath for divisor < 0: 5968 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 5969 // 
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 5970 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 5971 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 5972 movl(rdx, rax); 5973 subl(rax, divisor); 5974 if (VM_Version::supports_bmi1()) { 5975 andnl(rax, rax, rdx); 5976 } else { 5977 notl(rax); 5978 andl(rax, rdx); 5979 } 5980 movl(tmp, rax); 5981 shrl(rax, 31); // quotient 5982 sarl(tmp, 31); 5983 andl(tmp, divisor); 5984 subl(rdx, tmp); // remainder 5985 bind(done); 5986 } 5987 5988 #ifdef _LP64 5989 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 5990 XMMRegister xtmp2, Register rtmp) { 5991 if(VM_Version::supports_gfni()) { 5992 // Galois field instruction based bit reversal based on following algorithm. 5993 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5994 mov64(rtmp, 0x8040201008040201L); 5995 movq(xtmp1, src); 5996 movq(xtmp2, rtmp); 5997 gf2p8affineqb(xtmp1, xtmp2, 0); 5998 movq(dst, xtmp1); 5999 } else { 6000 // Swap even and odd numbered bits. 6001 movl(rtmp, src); 6002 andl(rtmp, 0x55555555); 6003 shll(rtmp, 1); 6004 movl(dst, src); 6005 andl(dst, 0xAAAAAAAA); 6006 shrl(dst, 1); 6007 orl(dst, rtmp); 6008 6009 // Swap LSB and MSB 2 bits of each nibble. 6010 movl(rtmp, dst); 6011 andl(rtmp, 0x33333333); 6012 shll(rtmp, 2); 6013 andl(dst, 0xCCCCCCCC); 6014 shrl(dst, 2); 6015 orl(dst, rtmp); 6016 6017 // Swap LSB and MSB 4 bits of each byte. 6018 movl(rtmp, dst); 6019 andl(rtmp, 0x0F0F0F0F); 6020 shll(rtmp, 4); 6021 andl(dst, 0xF0F0F0F0); 6022 shrl(dst, 4); 6023 orl(dst, rtmp); 6024 } 6025 bswapl(dst); 6026 } 6027 6028 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6029 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6030 if(VM_Version::supports_gfni()) { 6031 // Galois field instruction based bit reversal based on following algorithm. 6032 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6033 mov64(rtmp1, 0x8040201008040201L); 6034 movq(xtmp1, src); 6035 movq(xtmp2, rtmp1); 6036 gf2p8affineqb(xtmp1, xtmp2, 0); 6037 movq(dst, xtmp1); 6038 } else { 6039 // Swap even and odd numbered bits. 6040 movq(rtmp1, src); 6041 mov64(rtmp2, 0x5555555555555555L); 6042 andq(rtmp1, rtmp2); 6043 shlq(rtmp1, 1); 6044 movq(dst, src); 6045 notq(rtmp2); 6046 andq(dst, rtmp2); 6047 shrq(dst, 1); 6048 orq(dst, rtmp1); 6049 6050 // Swap LSB and MSB 2 bits of each nibble. 6051 movq(rtmp1, dst); 6052 mov64(rtmp2, 0x3333333333333333L); 6053 andq(rtmp1, rtmp2); 6054 shlq(rtmp1, 2); 6055 notq(rtmp2); 6056 andq(dst, rtmp2); 6057 shrq(dst, 2); 6058 orq(dst, rtmp1); 6059 6060 // Swap LSB and MSB 4 bits of each byte. 
6061 movq(rtmp1, dst); 6062 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 6063 andq(rtmp1, rtmp2); 6064 shlq(rtmp1, 4); 6065 notq(rtmp2); 6066 andq(dst, rtmp2); 6067 shrq(dst, 4); 6068 orq(dst, rtmp1); 6069 } 6070 bswapq(dst); 6071 } 6072 6073 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6074 Label done; 6075 Label neg_divisor_fastpath; 6076 cmpq(divisor, 0); 6077 jccb(Assembler::less, neg_divisor_fastpath); 6078 xorl(rdx, rdx); 6079 divq(divisor); 6080 jmpb(done); 6081 bind(neg_divisor_fastpath); 6082 // Fastpath for divisor < 0: 6083 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6084 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6085 movq(rdx, rax); 6086 subq(rdx, divisor); 6087 if (VM_Version::supports_bmi1()) { 6088 andnq(rax, rdx, rax); 6089 } else { 6090 notq(rdx); 6091 andq(rax, rdx); 6092 } 6093 shrq(rax, 63); 6094 bind(done); 6095 } 6096 6097 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6098 Label done; 6099 Label neg_divisor_fastpath; 6100 cmpq(divisor, 0); 6101 jccb(Assembler::less, neg_divisor_fastpath); 6102 xorq(rdx, rdx); 6103 divq(divisor); 6104 jmp(done); 6105 bind(neg_divisor_fastpath); 6106 // Fastpath when divisor < 0: 6107 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6108 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6109 movq(rdx, rax); 6110 subq(rax, divisor); 6111 if (VM_Version::supports_bmi1()) { 6112 andnq(rax, rax, rdx); 6113 } else { 6114 notq(rax); 6115 andq(rax, rdx); 6116 } 6117 sarq(rax, 63); 6118 andq(rax, divisor); 6119 subq(rdx, rax); 6120 bind(done); 6121 } 6122 6123 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6124 Label done; 6125 Label neg_divisor_fastpath; 6126 cmpq(divisor, 0); 6127 jccb(Assembler::less, neg_divisor_fastpath); 6128 xorq(rdx, rdx); 6129 divq(divisor); 6130 jmp(done); 6131 bind(neg_divisor_fastpath); 6132 // Fastpath for divisor < 0: 6133 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6134 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6135 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6136 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6137 movq(rdx, rax); 6138 subq(rax, divisor); 6139 if (VM_Version::supports_bmi1()) { 6140 andnq(rax, rax, rdx); 6141 } else { 6142 notq(rax); 6143 andq(rax, rdx); 6144 } 6145 movq(tmp, rax); 6146 shrq(rax, 63); // quotient 6147 sarq(tmp, 63); 6148 andq(tmp, divisor); 6149 subq(rdx, tmp); // remainder 6150 bind(done); 6151 } 6152 #endif 6153 6154 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6155 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6156 int vlen_enc) { 6157 assert(VM_Version::supports_avx512bw(), ""); 6158 // Byte shuffles are inlane operations and indices are determined using 6159 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6160 // normalized to index range 0-15. This makes sure that all the multiples 6161 // of an index value are placed at same relative position in 128 bit 6162 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6163 // will be 16th element in their respective 128 bit lanes. 
6164 movl(rtmp, 16); 6165 evpbroadcastb(xtmp1, rtmp, vlen_enc); 6166 6167 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16, 6168 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using 6169 // original shuffle indices and move the shuffled lanes corresponding to true 6170 // mask to destination vector. 6171 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6172 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc); 6173 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc); 6174 6175 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32 6176 // and broadcasting second 128 bit lane. 6177 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6178 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); 6179 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6180 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc); 6181 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6182 6183 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48 6184 // and broadcasting third 128 bit lane. 6185 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc); 6186 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); 6187 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6188 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc); 6189 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6190 6191 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64 6192 // and broadcasting third 128 bit lane. 6193 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6194 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); 6195 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6196 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); 6197 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6198 } 6199 6200 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst, 6201 XMMRegister shuffle, XMMRegister src, int vlen_enc) { 6202 if (vlen_enc == AVX_128bit) { 6203 vpermilps(dst, src, shuffle, vlen_enc); 6204 } else if (bt == T_INT) { 6205 vpermd(dst, shuffle, src, vlen_enc); 6206 } else { 6207 assert(bt == T_FLOAT, ""); 6208 vpermps(dst, shuffle, src, vlen_enc); 6209 } 6210 }
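// Scalar sketch of the negative-divisor fastpath emitted by udivL()/umodL()/udivmodL() above
// (illustrative only; it mirrors java.lang.Long.divideUnsigned/remainderUnsigned):
//   uint64_t q = (dividend & ~(dividend - divisor)) >> 63;  // divisor has its MSB set, so q is 0 or 1
//   uint64_t r = dividend - (q ? divisor : 0);              // the emitted code forms this with sarq/andq/subq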