1 /* 2 * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "oops/methodData.hpp" 29 #include "opto/c2_MacroAssembler.hpp" 30 #include "opto/intrinsicnode.hpp" 31 #include "opto/opcodes.hpp" 32 #include "opto/subnode.hpp" 33 #include "runtime/objectMonitor.hpp" 34 #include "runtime/stubRoutines.hpp" 35 36 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 37 switch (vlen_in_bytes) { 38 case 4: // fall-through 39 case 8: // fall-through 40 case 16: return Assembler::AVX_128bit; 41 case 32: return Assembler::AVX_256bit; 42 case 64: return Assembler::AVX_512bit; 43 44 default: { 45 ShouldNotReachHere(); 46 return Assembler::AVX_NoVec; 47 } 48 } 49 } 50 51 void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) { 52 guarantee(PostLoopMultiversioning, "must be"); 53 Assembler::movl(dst, 1); 54 Assembler::shlxl(dst, dst, src); 55 Assembler::decl(dst); 56 Assembler::kmovdl(mask, dst); 57 Assembler::movl(dst, src); 58 } 59 60 void C2_MacroAssembler::restorevectmask(KRegister mask) { 61 guarantee(PostLoopMultiversioning, "must be"); 62 Assembler::knotwl(mask, k0); 63 } 64 65 #if INCLUDE_RTM_OPT 66 67 // Update rtm_counters based on abort status 68 // input: abort_status 69 // rtm_counters (RTMLockingCounters*) 70 // flags are killed 71 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) { 72 73 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset())); 74 if (PrintPreciseRTMLockingStatistics) { 75 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 76 Label check_abort; 77 testl(abort_status, (1<<i)); 78 jccb(Assembler::equal, check_abort); 79 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx)))); 80 bind(check_abort); 81 } 82 } 83 } 84 85 // Branch if (random & (count-1) != 0), count is 2^n 86 // tmp, scr and flags are killed 87 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) { 88 assert(tmp == rax, ""); 89 assert(scr == rdx, ""); 90 rdtsc(); // modifies EDX:EAX 91 andptr(tmp, count-1); 92 jccb(Assembler::notZero, brLabel); 93 } 94 95 // Perform abort ratio calculation, set no_rtm bit if high ratio 96 // input: rtm_counters_Reg (RTMLockingCounters* address) 97 // tmpReg, rtm_counters_Reg and flags are killed 98 void 
C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg, 99 Register rtm_counters_Reg, 100 RTMLockingCounters* rtm_counters, 101 Metadata* method_data) { 102 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 103 104 if (RTMLockingCalculationDelay > 0) { 105 // Delay calculation 106 movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg); 107 testptr(tmpReg, tmpReg); 108 jccb(Assembler::equal, L_done); 109 } 110 // Abort ratio calculation only if abort_count > RTMAbortThreshold 111 // Aborted transactions = abort_count * 100 112 // All transactions = total_count * RTMTotalCountIncrRate 113 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 114 115 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset())); 116 cmpptr(tmpReg, RTMAbortThreshold); 117 jccb(Assembler::below, L_check_always_rtm2); 118 imulptr(tmpReg, tmpReg, 100); 119 120 Register scrReg = rtm_counters_Reg; 121 movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset())); 122 imulptr(scrReg, scrReg, RTMTotalCountIncrRate); 123 imulptr(scrReg, scrReg, RTMAbortRatio); 124 cmpptr(tmpReg, scrReg); 125 jccb(Assembler::below, L_check_always_rtm1); 126 if (method_data != NULL) { 127 // set rtm_state to "no rtm" in MDO 128 mov_metadata(tmpReg, method_data); 129 lock(); 130 orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM); 131 } 132 jmpb(L_done); 133 bind(L_check_always_rtm1); 134 // Reload RTMLockingCounters* address 135 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters)); 136 bind(L_check_always_rtm2); 137 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset())); 138 cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate); 139 jccb(Assembler::below, L_done); 140 if (method_data != NULL) { 141 // set rtm_state to "always rtm" in MDO 142 mov_metadata(tmpReg, method_data); 143 lock(); 144 orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM); 145 } 146 bind(L_done); 147 } 148 149 // Update counters and perform abort ratio calculation 150 // input: abort_status_Reg 151 // rtm_counters_Reg, flags are killed 152 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg, 153 Register rtm_counters_Reg, 154 RTMLockingCounters* rtm_counters, 155 Metadata* method_data, 156 bool profile_rtm) { 157 158 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 159 // update rtm counters based on rax value at abort 160 // reads abort_status_Reg, updates flags 161 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters)); 162 rtm_counters_update(abort_status_Reg, rtm_counters_Reg); 163 if (profile_rtm) { 164 // Save abort status because abort_status_Reg is used by following code. 
165 if (RTMRetryCount > 0) { 166 push(abort_status_Reg); 167 } 168 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 169 rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data); 170 // restore abort status 171 if (RTMRetryCount > 0) { 172 pop(abort_status_Reg); 173 } 174 } 175 } 176 177 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4) 178 // inputs: retry_count_Reg 179 // : abort_status_Reg 180 // output: retry_count_Reg decremented by 1 181 // flags are killed 182 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) { 183 Label doneRetry; 184 assert(abort_status_Reg == rax, ""); 185 // The abort reason bits are in eax (see all states in rtmLocking.hpp) 186 // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4) 187 // if reason is in 0x6 and retry count != 0 then retry 188 andptr(abort_status_Reg, 0x6); 189 jccb(Assembler::zero, doneRetry); 190 testl(retry_count_Reg, retry_count_Reg); 191 jccb(Assembler::zero, doneRetry); 192 pause(); 193 decrementl(retry_count_Reg); 194 jmp(retryLabel); 195 bind(doneRetry); 196 } 197 198 // Spin and retry if lock is busy, 199 // inputs: box_Reg (monitor address) 200 // : retry_count_Reg 201 // output: retry_count_Reg decremented by 1 202 // : clear z flag if retry count exceeded 203 // tmp_Reg, scr_Reg, flags are killed 204 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg, 205 Register tmp_Reg, Register scr_Reg, Label& retryLabel) { 206 Label SpinLoop, SpinExit, doneRetry; 207 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 208 209 testl(retry_count_Reg, retry_count_Reg); 210 jccb(Assembler::zero, doneRetry); 211 decrementl(retry_count_Reg); 212 movptr(scr_Reg, RTMSpinLoopCount); 213 214 bind(SpinLoop); 215 pause(); 216 decrementl(scr_Reg); 217 jccb(Assembler::lessEqual, SpinExit); 218 movptr(tmp_Reg, Address(box_Reg, owner_offset)); 219 testptr(tmp_Reg, tmp_Reg); 220 jccb(Assembler::notZero, SpinLoop); 221 222 bind(SpinExit); 223 jmp(retryLabel); 224 bind(doneRetry); 225 incrementl(retry_count_Reg); // clear z flag 226 } 227 228 // Use RTM for normal stack locks 229 // Input: objReg (object to lock) 230 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg, 231 Register retry_on_abort_count_Reg, 232 RTMLockingCounters* stack_rtm_counters, 233 Metadata* method_data, bool profile_rtm, 234 Label& DONE_LABEL, Label& IsInflated) { 235 assert(UseRTMForStackLocks, "why call this otherwise?"); 236 assert(tmpReg == rax, ""); 237 assert(scrReg == rdx, ""); 238 Label L_rtm_retry, L_decrement_retry, L_on_abort; 239 240 if (RTMRetryCount > 0) { 241 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 242 bind(L_rtm_retry); 243 } 244 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); 245 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral 246 jcc(Assembler::notZero, IsInflated); 247 248 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 249 Label L_noincrement; 250 if (RTMTotalCountIncrRate > 1) { 251 // tmpReg, scrReg and flags are killed 252 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement); 253 } 254 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 255 atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg); 256 bind(L_noincrement); 257 } 258 
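  // Start a transactional region (Intel RTM). If the transaction aborts for any
  // reason, execution resumes at L_on_abort with the abort status in EAX/RAX
  // (tmpReg here), which is what rtm_profiling() and rtm_retry_lock_on_abort()
  // inspect below. The xabort(0) further down forces an abort when the object
  // turns out to be locked and XEND-on-busy is not enabled.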
xbegin(L_on_abort); 259 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword 260 andptr(tmpReg, markWord::lock_mask_in_place); // look at 2 lock bits 261 cmpptr(tmpReg, markWord::unlocked_value); // bits = 01 unlocked 262 jcc(Assembler::equal, DONE_LABEL); // all done if unlocked 263 264 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX 265 if (UseRTMXendForLockBusy) { 266 xend(); 267 movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry) 268 jmp(L_decrement_retry); 269 } 270 else { 271 xabort(0); 272 } 273 bind(L_on_abort); 274 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 275 rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm); 276 } 277 bind(L_decrement_retry); 278 if (RTMRetryCount > 0) { 279 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4) 280 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 281 } 282 } 283 284 // Use RTM for inflating locks 285 // inputs: objReg (object to lock) 286 // boxReg (on-stack box address (displaced header location) - KILLED) 287 // tmpReg (ObjectMonitor address + markWord::monitor_value) 288 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg, 289 Register scrReg, Register retry_on_busy_count_Reg, 290 Register retry_on_abort_count_Reg, 291 RTMLockingCounters* rtm_counters, 292 Metadata* method_data, bool profile_rtm, 293 Label& DONE_LABEL) { 294 assert(UseRTMLocking, "why call this otherwise?"); 295 assert(tmpReg == rax, ""); 296 assert(scrReg == rdx, ""); 297 Label L_rtm_retry, L_decrement_retry, L_on_abort; 298 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 299 300 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 
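  // (The int32_t form lets movptr sign-extend the immediate directly into memory;
  // a full 64-bit immediate would instead be materialized through the scratch
  // register - r10/rscratch1 on x86_64 - hence the clobber noted above.)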
301 movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value())); 302 movptr(boxReg, tmpReg); // Save ObjectMonitor address 303 304 if (RTMRetryCount > 0) { 305 movl(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy 306 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 307 bind(L_rtm_retry); 308 } 309 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 310 Label L_noincrement; 311 if (RTMTotalCountIncrRate > 1) { 312 // tmpReg, scrReg and flags are killed 313 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement); 314 } 315 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 316 atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg); 317 bind(L_noincrement); 318 } 319 xbegin(L_on_abort); 320 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); 321 movptr(tmpReg, Address(tmpReg, owner_offset)); 322 testptr(tmpReg, tmpReg); 323 jcc(Assembler::zero, DONE_LABEL); 324 if (UseRTMXendForLockBusy) { 325 xend(); 326 jmp(L_decrement_retry); 327 } 328 else { 329 xabort(0); 330 } 331 bind(L_on_abort); 332 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX 333 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 334 rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm); 335 } 336 if (RTMRetryCount > 0) { 337 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4) 338 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 339 } 340 341 movptr(tmpReg, Address(boxReg, owner_offset)) ; 342 testptr(tmpReg, tmpReg) ; 343 jccb(Assembler::notZero, L_decrement_retry) ; 344 345 // Appears unlocked - try to swing _owner from null to non-null. 346 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. 347 #ifdef _LP64 348 Register threadReg = r15_thread; 349 #else 350 get_thread(scrReg); 351 Register threadReg = scrReg; 352 #endif 353 lock(); 354 cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg 355 356 if (RTMRetryCount > 0) { 357 // success done else retry 358 jccb(Assembler::equal, DONE_LABEL) ; 359 bind(L_decrement_retry); 360 // Spin and retry if lock is busy. 361 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry); 362 } 363 else { 364 bind(L_decrement_retry); 365 } 366 } 367 368 #endif // INCLUDE_RTM_OPT 369 370 // fast_lock and fast_unlock used by C2 371 372 // Because the transitions from emitted code to the runtime 373 // monitorenter/exit helper stubs are so slow it's critical that 374 // we inline both the stack-locking fast path and the inflated fast path. 375 // 376 // See also: cmpFastLock and cmpFastUnlock. 377 // 378 // What follows is a specialized inline transliteration of the code 379 // in enter() and exit(). If we're concerned about I$ bloat another 380 // option would be to emit TrySlowEnter and TrySlowExit methods 381 // at startup-time. These methods would accept arguments as 382 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure 383 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply 384 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit. 385 // In practice, however, the # of lock sites is bounded and is usually small. 
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//     fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//     fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
438 // In the case of failure, the node will branch directly to the 439 // FailureLabel 440 441 442 // obj: object to lock 443 // box: on-stack box address (displaced header location) - KILLED 444 // rax,: tmp -- KILLED 445 // scr: tmp -- KILLED 446 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, 447 Register scrReg, Register cx1Reg, Register cx2Reg, 448 RTMLockingCounters* rtm_counters, 449 RTMLockingCounters* stack_rtm_counters, 450 Metadata* method_data, 451 bool use_rtm, bool profile_rtm) { 452 // Ensure the register assignments are disjoint 453 assert(tmpReg == rax, ""); 454 455 if (use_rtm) { 456 assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg); 457 } else { 458 assert(cx2Reg == noreg, ""); 459 assert_different_registers(objReg, boxReg, tmpReg, scrReg); 460 } 461 462 // Possible cases that we'll encounter in fast_lock 463 // ------------------------------------------------ 464 // * Inflated 465 // -- unlocked 466 // -- Locked 467 // = by self 468 // = by other 469 // * neutral 470 // * stack-locked 471 // -- by self 472 // = sp-proximity test hits 473 // = sp-proximity test generates false-negative 474 // -- by other 475 // 476 477 Label IsInflated, DONE_LABEL; 478 479 if (DiagnoseSyncOnValueBasedClasses != 0) { 480 load_klass(tmpReg, objReg, cx1Reg); 481 movl(tmpReg, Address(tmpReg, Klass::access_flags_offset())); 482 testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS); 483 jcc(Assembler::notZero, DONE_LABEL); 484 } 485 486 #if INCLUDE_RTM_OPT 487 if (UseRTMForStackLocks && use_rtm) { 488 assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive"); 489 rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg, 490 stack_rtm_counters, method_data, profile_rtm, 491 DONE_LABEL, IsInflated); 492 } 493 #endif // INCLUDE_RTM_OPT 494 495 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH] 496 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral 497 jccb(Assembler::notZero, IsInflated); 498 499 if (!UseHeavyMonitors) { 500 // Attempt stack-locking ... 501 orptr (tmpReg, markWord::unlocked_value); 502 if (EnableValhalla) { 503 // Mask inline_type bit such that we go to the slow path if object is an inline type 504 andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place)); 505 } 506 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS 507 lock(); 508 cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg 509 jcc(Assembler::equal, DONE_LABEL); // Success 510 511 // Recursive locking. 512 // The object is stack-locked: markword contains stack pointer to BasicLock. 513 // Locked by current thread if difference with current SP is less than one page. 514 subptr(tmpReg, rsp); 515 // Next instruction set ZFlag == 1 (Success) if difference is less then one page. 516 andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) ); 517 movptr(Address(boxReg, 0), tmpReg); 518 } else { 519 // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0. 520 testptr(objReg, objReg); 521 } 522 jmp(DONE_LABEL); 523 524 bind(IsInflated); 525 // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value 526 527 #if INCLUDE_RTM_OPT 528 // Use the same RTM locking code in 32- and 64-bit VM. 
529 if (use_rtm) { 530 rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg, 531 rtm_counters, method_data, profile_rtm, DONE_LABEL); 532 } else { 533 #endif // INCLUDE_RTM_OPT 534 535 #ifndef _LP64 536 // The object is inflated. 537 538 // boxReg refers to the on-stack BasicLock in the current frame. 539 // We'd like to write: 540 // set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices. 541 // This is convenient but results a ST-before-CAS penalty. The following CAS suffers 542 // additional latency as we have another ST in the store buffer that must drain. 543 544 // avoid ST-before-CAS 545 // register juggle because we need tmpReg for cmpxchgptr below 546 movptr(scrReg, boxReg); 547 movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2] 548 549 // Optimistic form: consider XORL tmpReg,tmpReg 550 movptr(tmpReg, NULL_WORD); 551 552 // Appears unlocked - try to swing _owner from null to non-null. 553 // Ideally, I'd manifest "Self" with get_thread and then attempt 554 // to CAS the register containing Self into m->Owner. 555 // But we don't have enough registers, so instead we can either try to CAS 556 // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds 557 // we later store "Self" into m->Owner. Transiently storing a stack address 558 // (rsp or the address of the box) into m->owner is harmless. 559 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. 560 lock(); 561 cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 562 movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3 563 // If we weren't able to swing _owner from NULL to the BasicLock 564 // then take the slow path. 565 jccb (Assembler::notZero, DONE_LABEL); 566 // update _owner from BasicLock to thread 567 get_thread (scrReg); // beware: clobbers ICCs 568 movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg); 569 xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success 570 571 // If the CAS fails we can either retry or pass control to the slow path. 572 // We use the latter tactic. 573 // Pass the CAS result in the icc.ZFlag into DONE_LABEL 574 // If the CAS was successful ... 575 // Self has acquired the lock 576 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it. 577 // Intentional fall-through into DONE_LABEL ... 578 #else // _LP64 579 // It's inflated and we use scrReg for ObjectMonitor* in this section. 580 movq(scrReg, tmpReg); 581 xorq(tmpReg, tmpReg); 582 lock(); 583 cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 584 // Unconditionally set box->_displaced_header = markWord::unused_mark(). 585 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 586 movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value())); 587 // Propagate ICC.ZF from CAS above into DONE_LABEL. 
  jcc(Assembler::equal, DONE_LABEL);    // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(r15_thread, rax);                // Check if we are already the owner (recursive lock)
  jcc(Assembler::notEqual, DONE_LABEL);   // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1: At return-time the interpreter automatically and quietly unlocks any
//     objects acquired by the current activation (frame). Recall that the
//     interpreter maintains an on-stack list of locks currently held by
//     a frame.
// I2: If a method attempts to unlock an object that is not held by the
//     frame the interpreter throws IMSX (IllegalMonitorStateException).
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
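// For illustration only (not code emitted here; the register choices are
// hypothetical): a user of the ZF protocol described above is expected to emit
// the fast path followed by a conditional branch to the runtime helper, roughly
//
//   fast_lock(obj, box, rax, scr, ...);    // sets ZF per the protocol above
//   jcc(Assembler::notZero, slow_path);    // ZF == 0 -> call the runtime helper
//   ...                                    // ZF == 1 -> lock acquired, fall through
//
// which is exactly the branches-to-branches pattern the TODO above would like
// to avoid by passing a FailureLabel instead.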
642 643 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) { 644 assert(boxReg == rax, ""); 645 assert_different_registers(objReg, boxReg, tmpReg); 646 647 Label DONE_LABEL, Stacked, CheckSucc; 648 649 #if INCLUDE_RTM_OPT 650 if (UseRTMForStackLocks && use_rtm) { 651 assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive"); 652 Label L_regular_unlock; 653 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword 654 andptr(tmpReg, markWord::lock_mask_in_place); // look at 2 lock bits 655 cmpptr(tmpReg, markWord::unlocked_value); // bits = 01 unlocked 656 jccb(Assembler::notEqual, L_regular_unlock); // if !HLE RegularLock 657 xend(); // otherwise end... 658 jmp(DONE_LABEL); // ... and we're done 659 bind(L_regular_unlock); 660 } 661 #endif 662 663 if (!UseHeavyMonitors) { 664 cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header 665 jcc (Assembler::zero, DONE_LABEL); // 0 indicates recursive stack-lock 666 } 667 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword 668 if (!UseHeavyMonitors) { 669 testptr(tmpReg, markWord::monitor_value); // Inflated? 670 jccb (Assembler::zero, Stacked); 671 } 672 673 // It's inflated. 674 #if INCLUDE_RTM_OPT 675 if (use_rtm) { 676 Label L_regular_inflated_unlock; 677 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 678 movptr(boxReg, Address(tmpReg, owner_offset)); 679 testptr(boxReg, boxReg); 680 jccb(Assembler::notZero, L_regular_inflated_unlock); 681 xend(); 682 jmpb(DONE_LABEL); 683 bind(L_regular_inflated_unlock); 684 } 685 #endif 686 687 // Despite our balanced locking property we still check that m->_owner == Self 688 // as java routines or native JNI code called by this thread might 689 // have released the lock. 690 // Refer to the comments in synchronizer.cpp for how we might encode extra 691 // state in _succ so we can avoid fetching EntryList|cxq. 692 // 693 // If there's no contention try a 1-0 exit. That is, exit without 694 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how 695 // we detect and recover from the race that the 1-0 exit admits. 696 // 697 // Conceptually fast_unlock() must execute a STST|LDST "release" barrier 698 // before it STs null into _owner, releasing the lock. Updates 699 // to data protected by the critical section must be visible before 700 // we drop the lock (and thus before any other thread could acquire 701 // the lock and observe the fields protected by the lock). 702 // IA32's memory-model is SPO, so STs are ordered with respect to 703 // each other and there's no need for an explicit barrier (fence). 704 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. 705 #ifndef _LP64 706 get_thread (boxReg); 707 708 // Note that we could employ various encoding schemes to reduce 709 // the number of loads below (currently 4) to just 2 or 3. 710 // Refer to the comments in synchronizer.cpp. 711 // In practice the chain of fetches doesn't seem to impact performance, however. 
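  // The fetches below implement the checks described above:
  //   _recursions != 0        -> exit through DONE_LABEL with ZF = 0 (slow path handles recursion);
  //   _EntryList | _cxq != 0  -> contended, branch to CheckSucc (which, in this 32-bit
  //                              path, falls into DONE_LABEL with ZF = 0);
  //   otherwise               -> store NULL into _owner (1-0 exit) and jump to
  //                              DONE_LABEL with ZF = 1 (success).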
712 xorptr(boxReg, boxReg); 713 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 714 jccb (Assembler::notZero, DONE_LABEL); 715 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 716 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 717 jccb (Assembler::notZero, CheckSucc); 718 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 719 jmpb (DONE_LABEL); 720 721 bind (Stacked); 722 // It's not inflated and it's not recursively stack-locked. 723 // It must be stack-locked. 724 // Try to reset the header to displaced header. 725 // The "box" value on the stack is stable, so we can reload 726 // and be assured we observe the same value as above. 727 movptr(tmpReg, Address(boxReg, 0)); 728 lock(); 729 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 730 // Intention fall-thru into DONE_LABEL 731 732 // DONE_LABEL is a hot target - we'd really like to place it at the 733 // start of cache line by padding with NOPs. 734 // See the AMD and Intel software optimization manuals for the 735 // most efficient "long" NOP encodings. 736 // Unfortunately none of our alignment mechanisms suffice. 737 bind (CheckSucc); 738 #else // _LP64 739 // It's inflated 740 Label LNotRecursive, LSuccess, LGoSlowPath; 741 742 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0); 743 jccb(Assembler::equal, LNotRecursive); 744 745 // Recursive inflated unlock 746 decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 747 jmpb(LSuccess); 748 749 bind(LNotRecursive); 750 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 751 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 752 jccb (Assembler::notZero, CheckSucc); 753 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 754 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD); 755 jmpb (DONE_LABEL); 756 757 // Try to avoid passing control into the slow_path ... 758 bind (CheckSucc); 759 760 // The following optional optimization can be elided if necessary 761 // Effectively: if (succ == null) goto slow path 762 // The code reduces the window for a race, however, 763 // and thus benefits performance. 764 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD); 765 jccb (Assembler::zero, LGoSlowPath); 766 767 xorptr(boxReg, boxReg); 768 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 769 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD); 770 771 // Memory barrier/fence 772 // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ 773 // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack. 774 // This is faster on Nehalem and AMD Shanghai/Barcelona. 775 // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences 776 // We might also restructure (ST Owner=0;barrier;LD _Succ) to 777 // (mov box,0; xchgq box, &m->Owner; LD _succ) . 778 lock(); addl(Address(rsp, 0), 0); 779 780 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD); 781 jccb (Assembler::notZero, LSuccess); 782 783 // Rare inopportune interleaving - race. 784 // The successor vanished in the small window above. 785 // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor. 786 // We need to ensure progress and succession. 
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind (LGoSlowPath);
  orl (boxReg, 1);   // set ICC.ZF=0 to indicate failure
  jmpb (DONE_LABEL);

  bind (LSuccess);
  testl (boxReg, 0); // set ICC.ZF=1 to indicate success
  jmpb (DONE_LABEL);

  if (!UseHeavyMonitors) {
    bind (Stacked);
    movptr(tmpReg, Address (boxReg, 0)); // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  }
#endif
  bind(DONE_LABEL);
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV
|| opcode == Op_MaxV, "sanity"); 870 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 871 872 if (opcode == Op_MinV) { 873 if (elem_bt == T_BYTE) { 874 pminsb(dst, src); 875 } else if (elem_bt == T_SHORT) { 876 pminsw(dst, src); 877 } else if (elem_bt == T_INT) { 878 pminsd(dst, src); 879 } else { 880 assert(elem_bt == T_LONG, "required"); 881 assert(tmp == xmm0, "required"); 882 assert_different_registers(dst, src, tmp); 883 movdqu(xmm0, dst); 884 pcmpgtq(xmm0, src); 885 blendvpd(dst, src); // xmm0 as mask 886 } 887 } else { // opcode == Op_MaxV 888 if (elem_bt == T_BYTE) { 889 pmaxsb(dst, src); 890 } else if (elem_bt == T_SHORT) { 891 pmaxsw(dst, src); 892 } else if (elem_bt == T_INT) { 893 pmaxsd(dst, src); 894 } else { 895 assert(elem_bt == T_LONG, "required"); 896 assert(tmp == xmm0, "required"); 897 assert_different_registers(dst, src, tmp); 898 movdqu(xmm0, src); 899 pcmpgtq(xmm0, dst); 900 blendvpd(dst, src); // xmm0 as mask 901 } 902 } 903 } 904 905 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 906 XMMRegister dst, XMMRegister src1, XMMRegister src2, 907 int vlen_enc) { 908 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 909 910 if (opcode == Op_MinV) { 911 if (elem_bt == T_BYTE) { 912 vpminsb(dst, src1, src2, vlen_enc); 913 } else if (elem_bt == T_SHORT) { 914 vpminsw(dst, src1, src2, vlen_enc); 915 } else if (elem_bt == T_INT) { 916 vpminsd(dst, src1, src2, vlen_enc); 917 } else { 918 assert(elem_bt == T_LONG, "required"); 919 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 920 vpminsq(dst, src1, src2, vlen_enc); 921 } else { 922 assert_different_registers(dst, src1, src2); 923 vpcmpgtq(dst, src1, src2, vlen_enc); 924 vblendvpd(dst, src1, src2, dst, vlen_enc); 925 } 926 } 927 } else { // opcode == Op_MaxV 928 if (elem_bt == T_BYTE) { 929 vpmaxsb(dst, src1, src2, vlen_enc); 930 } else if (elem_bt == T_SHORT) { 931 vpmaxsw(dst, src1, src2, vlen_enc); 932 } else if (elem_bt == T_INT) { 933 vpmaxsd(dst, src1, src2, vlen_enc); 934 } else { 935 assert(elem_bt == T_LONG, "required"); 936 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 937 vpmaxsq(dst, src1, src2, vlen_enc); 938 } else { 939 assert_different_registers(dst, src1, src2); 940 vpcmpgtq(dst, src1, src2, vlen_enc); 941 vblendvpd(dst, src2, src1, dst, vlen_enc); 942 } 943 } 944 } 945 } 946 947 // Float/Double min max 948 949 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, 950 XMMRegister dst, XMMRegister a, XMMRegister b, 951 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 952 int vlen_enc) { 953 assert(UseAVX > 0, "required"); 954 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 955 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 956 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 957 assert_different_registers(a, b, tmp, atmp, btmp); 958 959 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 960 bool is_double_word = is_double_word_type(elem_bt); 961 962 if (!is_double_word && is_min) { 963 vblendvps(atmp, a, b, a, vlen_enc); 964 vblendvps(btmp, b, a, a, vlen_enc); 965 vminps(tmp, atmp, btmp, vlen_enc); 966 vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 967 vblendvps(dst, tmp, atmp, btmp, vlen_enc); 968 } else if (!is_double_word && !is_min) { 969 vblendvps(btmp, b, a, b, vlen_enc); 970 vblendvps(atmp, a, b, b, vlen_enc); 971 vmaxps(tmp, atmp, btmp, vlen_enc); 972 vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 973 
vblendvps(dst, tmp, atmp, btmp, vlen_enc); 974 } else if (is_double_word && is_min) { 975 vblendvpd(atmp, a, b, a, vlen_enc); 976 vblendvpd(btmp, b, a, a, vlen_enc); 977 vminpd(tmp, atmp, btmp, vlen_enc); 978 vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 979 vblendvpd(dst, tmp, atmp, btmp, vlen_enc); 980 } else { 981 assert(is_double_word && !is_min, "sanity"); 982 vblendvpd(btmp, b, a, b, vlen_enc); 983 vblendvpd(atmp, a, b, b, vlen_enc); 984 vmaxpd(tmp, atmp, btmp, vlen_enc); 985 vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 986 vblendvpd(dst, tmp, atmp, btmp, vlen_enc); 987 } 988 } 989 990 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 991 XMMRegister dst, XMMRegister a, XMMRegister b, 992 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 993 int vlen_enc) { 994 assert(UseAVX > 2, "required"); 995 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 996 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 997 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 998 assert_different_registers(dst, a, b, atmp, btmp); 999 1000 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1001 bool is_double_word = is_double_word_type(elem_bt); 1002 bool merge = true; 1003 1004 if (!is_double_word && is_min) { 1005 evpmovd2m(ktmp, a, vlen_enc); 1006 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1007 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1008 vminps(dst, atmp, btmp, vlen_enc); 1009 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1010 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1011 } else if (!is_double_word && !is_min) { 1012 evpmovd2m(ktmp, b, vlen_enc); 1013 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1014 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1015 vmaxps(dst, atmp, btmp, vlen_enc); 1016 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1017 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1018 } else if (is_double_word && is_min) { 1019 evpmovq2m(ktmp, a, vlen_enc); 1020 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1021 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1022 vminpd(dst, atmp, btmp, vlen_enc); 1023 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1024 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1025 } else { 1026 assert(is_double_word && !is_min, "sanity"); 1027 evpmovq2m(ktmp, b, vlen_enc); 1028 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1029 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1030 vmaxpd(dst, atmp, btmp, vlen_enc); 1031 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1032 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1033 } 1034 } 1035 1036 // Float/Double signum 1037 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, 1038 XMMRegister zero, XMMRegister one, 1039 Register scratch) { 1040 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1041 1042 Label DONE_LABEL; 1043 1044 if (opcode == Op_SignumF) { 1045 assert(UseSSE > 0, "required"); 1046 ucomiss(dst, zero); 1047 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1048 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1049 movflt(dst, one); 1050 jcc(Assembler::above, DONE_LABEL); 1051 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch); 1052 } else if (opcode == Op_SignumD) { 1053 assert(UseSSE > 1, "required"); 1054 ucomisd(dst, zero); 1055 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if 
argument is +0.0/-0.0, return argument 1056 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1057 movdbl(dst, one); 1058 jcc(Assembler::above, DONE_LABEL); 1059 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch); 1060 } 1061 1062 bind(DONE_LABEL); 1063 } 1064 1065 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1066 if (sign) { 1067 pmovsxbw(dst, src); 1068 } else { 1069 pmovzxbw(dst, src); 1070 } 1071 } 1072 1073 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1074 if (sign) { 1075 vpmovsxbw(dst, src, vector_len); 1076 } else { 1077 vpmovzxbw(dst, src, vector_len); 1078 } 1079 } 1080 1081 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1082 if (sign) { 1083 vpmovsxbd(dst, src, vector_len); 1084 } else { 1085 vpmovzxbd(dst, src, vector_len); 1086 } 1087 } 1088 1089 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1090 if (sign) { 1091 vpmovsxwd(dst, src, vector_len); 1092 } else { 1093 vpmovzxwd(dst, src, vector_len); 1094 } 1095 } 1096 1097 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1098 int shift, int vector_len) { 1099 if (opcode == Op_RotateLeftV) { 1100 if (etype == T_INT) { 1101 evprold(dst, src, shift, vector_len); 1102 } else { 1103 assert(etype == T_LONG, "expected type T_LONG"); 1104 evprolq(dst, src, shift, vector_len); 1105 } 1106 } else { 1107 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1108 if (etype == T_INT) { 1109 evprord(dst, src, shift, vector_len); 1110 } else { 1111 assert(etype == T_LONG, "expected type T_LONG"); 1112 evprorq(dst, src, shift, vector_len); 1113 } 1114 } 1115 } 1116 1117 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1118 XMMRegister shift, int vector_len) { 1119 if (opcode == Op_RotateLeftV) { 1120 if (etype == T_INT) { 1121 evprolvd(dst, src, shift, vector_len); 1122 } else { 1123 assert(etype == T_LONG, "expected type T_LONG"); 1124 evprolvq(dst, src, shift, vector_len); 1125 } 1126 } else { 1127 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1128 if (etype == T_INT) { 1129 evprorvd(dst, src, shift, vector_len); 1130 } else { 1131 assert(etype == T_LONG, "expected type T_LONG"); 1132 evprorvq(dst, src, shift, vector_len); 1133 } 1134 } 1135 } 1136 1137 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1138 if (opcode == Op_RShiftVI) { 1139 psrad(dst, shift); 1140 } else if (opcode == Op_LShiftVI) { 1141 pslld(dst, shift); 1142 } else { 1143 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1144 psrld(dst, shift); 1145 } 1146 } 1147 1148 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1149 switch (opcode) { 1150 case Op_RShiftVI: psrad(dst, shift); break; 1151 case Op_LShiftVI: pslld(dst, shift); break; 1152 case Op_URShiftVI: psrld(dst, shift); break; 1153 1154 default: assert(false, "%s", NodeClassNames[opcode]); 1155 } 1156 } 1157 1158 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1159 if (opcode == Op_RShiftVI) { 1160 vpsrad(dst, nds, shift, vector_len); 1161 } else if (opcode == Op_LShiftVI) { 1162 vpslld(dst, nds, shift, vector_len); 1163 } else { 1164 assert((opcode == Op_URShiftVI),"opcode should 
be Op_URShiftVI"); 1165 vpsrld(dst, nds, shift, vector_len); 1166 } 1167 } 1168 1169 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1170 switch (opcode) { 1171 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1172 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1173 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1174 1175 default: assert(false, "%s", NodeClassNames[opcode]); 1176 } 1177 } 1178 1179 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1180 switch (opcode) { 1181 case Op_RShiftVB: // fall-through 1182 case Op_RShiftVS: psraw(dst, shift); break; 1183 1184 case Op_LShiftVB: // fall-through 1185 case Op_LShiftVS: psllw(dst, shift); break; 1186 1187 case Op_URShiftVS: // fall-through 1188 case Op_URShiftVB: psrlw(dst, shift); break; 1189 1190 default: assert(false, "%s", NodeClassNames[opcode]); 1191 } 1192 } 1193 1194 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1195 switch (opcode) { 1196 case Op_RShiftVB: // fall-through 1197 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1198 1199 case Op_LShiftVB: // fall-through 1200 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1201 1202 case Op_URShiftVS: // fall-through 1203 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1204 1205 default: assert(false, "%s", NodeClassNames[opcode]); 1206 } 1207 } 1208 1209 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1210 switch (opcode) { 1211 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1212 case Op_LShiftVL: psllq(dst, shift); break; 1213 case Op_URShiftVL: psrlq(dst, shift); break; 1214 1215 default: assert(false, "%s", NodeClassNames[opcode]); 1216 } 1217 } 1218 1219 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1220 if (opcode == Op_RShiftVL) { 1221 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1222 } else if (opcode == Op_LShiftVL) { 1223 psllq(dst, shift); 1224 } else { 1225 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1226 psrlq(dst, shift); 1227 } 1228 } 1229 1230 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1231 switch (opcode) { 1232 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1233 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1234 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1235 1236 default: assert(false, "%s", NodeClassNames[opcode]); 1237 } 1238 } 1239 1240 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1241 if (opcode == Op_RShiftVL) { 1242 evpsraq(dst, nds, shift, vector_len); 1243 } else if (opcode == Op_LShiftVL) { 1244 vpsllq(dst, nds, shift, vector_len); 1245 } else { 1246 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1247 vpsrlq(dst, nds, shift, vector_len); 1248 } 1249 } 1250 1251 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1252 switch (opcode) { 1253 case Op_RShiftVB: // fall-through 1254 case Op_RShiftVS: // fall-through 1255 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1256 1257 case Op_LShiftVB: // fall-through 1258 case Op_LShiftVS: // fall-through 1259 case Op_LShiftVI: vpsllvd(dst, src, 
shift, vlen_enc); break; 1260 1261 case Op_URShiftVB: // fall-through 1262 case Op_URShiftVS: // fall-through 1263 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1264 1265 default: assert(false, "%s", NodeClassNames[opcode]); 1266 } 1267 } 1268 1269 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1270 switch (opcode) { 1271 case Op_RShiftVB: // fall-through 1272 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1273 1274 case Op_LShiftVB: // fall-through 1275 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1276 1277 case Op_URShiftVB: // fall-through 1278 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1279 1280 default: assert(false, "%s", NodeClassNames[opcode]); 1281 } 1282 } 1283 1284 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1285 assert(UseAVX >= 2, "required"); 1286 switch (opcode) { 1287 case Op_RShiftVL: { 1288 if (UseAVX > 2) { 1289 assert(tmp == xnoreg, "not used"); 1290 if (!VM_Version::supports_avx512vl()) { 1291 vlen_enc = Assembler::AVX_512bit; 1292 } 1293 evpsravq(dst, src, shift, vlen_enc); 1294 } else { 1295 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1296 vpsrlvq(dst, src, shift, vlen_enc); 1297 vpsrlvq(tmp, tmp, shift, vlen_enc); 1298 vpxor(dst, dst, tmp, vlen_enc); 1299 vpsubq(dst, dst, tmp, vlen_enc); 1300 } 1301 break; 1302 } 1303 case Op_LShiftVL: { 1304 assert(tmp == xnoreg, "not used"); 1305 vpsllvq(dst, src, shift, vlen_enc); 1306 break; 1307 } 1308 case Op_URShiftVL: { 1309 assert(tmp == xnoreg, "not used"); 1310 vpsrlvq(dst, src, shift, vlen_enc); 1311 break; 1312 } 1313 default: assert(false, "%s", NodeClassNames[opcode]); 1314 } 1315 } 1316 1317 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1318 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) { 1319 assert(opcode == Op_LShiftVB || 1320 opcode == Op_RShiftVB || 1321 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1322 bool sign = (opcode != Op_URShiftVB); 1323 assert(vector_len == 0, "required"); 1324 vextendbd(sign, dst, src, 1); 1325 vpmovzxbd(vtmp, shift, 1); 1326 varshiftd(opcode, dst, dst, vtmp, 1); 1327 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch); 1328 vextracti128_high(vtmp, dst); 1329 vpackusdw(dst, dst, vtmp, 0); 1330 } 1331 1332 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1333 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) { 1334 assert(opcode == Op_LShiftVB || 1335 opcode == Op_RShiftVB || 1336 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1337 bool sign = (opcode != Op_URShiftVB); 1338 int ext_vector_len = vector_len + 1; 1339 vextendbw(sign, dst, src, ext_vector_len); 1340 vpmovzxbw(vtmp, shift, ext_vector_len); 1341 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1342 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch); 1343 if (vector_len == 0) { 1344 vextracti128_high(vtmp, dst); 1345 vpackuswb(dst, dst, vtmp, vector_len); 1346 } else { 1347 vextracti64x4_high(vtmp, dst); 1348 vpackuswb(dst, dst, vtmp, vector_len); 1349 vpermq(dst, dst, 0xD8, 
vector_len); 1350 } 1351 } 1352 1353 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1354 switch(typ) { 1355 case T_BYTE: 1356 pinsrb(dst, val, idx); 1357 break; 1358 case T_SHORT: 1359 pinsrw(dst, val, idx); 1360 break; 1361 case T_INT: 1362 pinsrd(dst, val, idx); 1363 break; 1364 case T_LONG: 1365 pinsrq(dst, val, idx); 1366 break; 1367 default: 1368 assert(false,"Should not reach here."); 1369 break; 1370 } 1371 } 1372 1373 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1374 switch(typ) { 1375 case T_BYTE: 1376 vpinsrb(dst, src, val, idx); 1377 break; 1378 case T_SHORT: 1379 vpinsrw(dst, src, val, idx); 1380 break; 1381 case T_INT: 1382 vpinsrd(dst, src, val, idx); 1383 break; 1384 case T_LONG: 1385 vpinsrq(dst, src, val, idx); 1386 break; 1387 default: 1388 assert(false,"Should not reach here."); 1389 break; 1390 } 1391 } 1392 1393 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1394 switch(typ) { 1395 case T_INT: 1396 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1397 break; 1398 case T_FLOAT: 1399 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1400 break; 1401 case T_LONG: 1402 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1403 break; 1404 case T_DOUBLE: 1405 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1406 break; 1407 default: 1408 assert(false,"Should not reach here."); 1409 break; 1410 } 1411 } 1412 1413 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1414 switch(typ) { 1415 case T_INT: 1416 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1417 break; 1418 case T_FLOAT: 1419 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1420 break; 1421 case T_LONG: 1422 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1423 break; 1424 case T_DOUBLE: 1425 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1426 break; 1427 default: 1428 assert(false,"Should not reach here."); 1429 break; 1430 } 1431 } 1432 1433 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1434 switch(typ) { 1435 case T_INT: 1436 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1437 break; 1438 case T_FLOAT: 1439 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1440 break; 1441 case T_LONG: 1442 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1443 break; 1444 case T_DOUBLE: 1445 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1446 break; 1447 default: 1448 assert(false,"Should not reach here."); 1449 break; 1450 } 1451 } 1452 1453 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1454 if (vlen_in_bytes <= 16) { 1455 pxor (dst, dst); 1456 psubb(dst, src); 1457 switch (elem_bt) { 1458 case T_BYTE: /* nothing to do */ break; 1459 case T_SHORT: pmovsxbw(dst, dst); break; 1460 case T_INT: pmovsxbd(dst, dst); break; 1461 case T_FLOAT: pmovsxbd(dst, dst); break; 1462 case T_LONG: pmovsxbq(dst, dst); break; 1463 case T_DOUBLE: pmovsxbq(dst, dst); break; 1464 1465 default: assert(false, "%s", 
type2name(elem_bt)); 1466 } 1467 } else { 1468 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1469 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1470 1471 vpxor (dst, dst, dst, vlen_enc); 1472 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc); 1473 1474 switch (elem_bt) { 1475 case T_BYTE: /* nothing to do */ break; 1476 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1477 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1478 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1479 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1480 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1481 1482 default: assert(false, "%s", type2name(elem_bt)); 1483 } 1484 } 1485 } 1486 1487 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, 1488 Register tmp, bool novlbwdq, int vlen_enc) { 1489 if (novlbwdq) { 1490 vpmovsxbd(xtmp, src, vlen_enc); 1491 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1492 Assembler::eq, true, vlen_enc, tmp); 1493 } else { 1494 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1495 vpsubb(xtmp, xtmp, src, vlen_enc); 1496 evpmovb2m(dst, xtmp, vlen_enc); 1497 } 1498 } 1499 1500 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1501 switch (vlen_in_bytes) { 1502 case 4: movdl(dst, src); break; 1503 case 8: movq(dst, src); break; 1504 case 16: movdqu(dst, src); break; 1505 case 32: vmovdqu(dst, src); break; 1506 case 64: evmovdquq(dst, src, Assembler::AVX_512bit); break; 1507 default: ShouldNotReachHere(); 1508 } 1509 } 1510 1511 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1512 if (reachable(src)) { 1513 load_vector(dst, as_Address(src), vlen_in_bytes); 1514 } else { 1515 lea(rscratch, src); 1516 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1517 } 1518 } 1519 1520 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) { 1521 ExternalAddress addr(StubRoutines::x86::vector_iota_indices()); 1522 if (vlen_in_bytes == 4) { 1523 movdl(dst, addr); 1524 } else if (vlen_in_bytes == 8) { 1525 movq(dst, addr); 1526 } else if (vlen_in_bytes == 16) { 1527 movdqu(dst, addr, scratch); 1528 } else if (vlen_in_bytes == 32) { 1529 vmovdqu(dst, addr, scratch); 1530 } else { 1531 assert(vlen_in_bytes == 64, "%d", vlen_in_bytes); 1532 evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch); 1533 } 1534 } 1535 1536 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
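//
// The integer reductions below all follow the same shape: the vector is
// repeatedly folded in half (an extract/shuffle of the upper half combined
// with the lower half via reduce_operation_128/256) until a single lane
// remains, which is then combined with the scalar accumulator in src1 and
// moved to dst. Roughly, for an 8-lane int add reduction (illustrative
// sketch only, not the exact generated sequence):
//   acc[0..3] = op(src[0..3], src[4..7]);   // 256 -> 128 bits
//   acc[0..1] = op(acc[0..1], acc[2..3]);   // 128 -> 64 bits
//   acc[0]    = op(acc[0],    acc[1]);      // 64  -> 32 bits
//   dst       = op(acc[0],    src1);        // fold in the scalar input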
1537 1538 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1539 int vector_len = Assembler::AVX_128bit; 1540 1541 switch (opcode) { 1542 case Op_AndReductionV: pand(dst, src); break; 1543 case Op_OrReductionV: por (dst, src); break; 1544 case Op_XorReductionV: pxor(dst, src); break; 1545 case Op_MinReductionV: 1546 switch (typ) { 1547 case T_BYTE: pminsb(dst, src); break; 1548 case T_SHORT: pminsw(dst, src); break; 1549 case T_INT: pminsd(dst, src); break; 1550 case T_LONG: assert(UseAVX > 2, "required"); 1551 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1552 default: assert(false, "wrong type"); 1553 } 1554 break; 1555 case Op_MaxReductionV: 1556 switch (typ) { 1557 case T_BYTE: pmaxsb(dst, src); break; 1558 case T_SHORT: pmaxsw(dst, src); break; 1559 case T_INT: pmaxsd(dst, src); break; 1560 case T_LONG: assert(UseAVX > 2, "required"); 1561 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1562 default: assert(false, "wrong type"); 1563 } 1564 break; 1565 case Op_AddReductionVF: addss(dst, src); break; 1566 case Op_AddReductionVD: addsd(dst, src); break; 1567 case Op_AddReductionVI: 1568 switch (typ) { 1569 case T_BYTE: paddb(dst, src); break; 1570 case T_SHORT: paddw(dst, src); break; 1571 case T_INT: paddd(dst, src); break; 1572 default: assert(false, "wrong type"); 1573 } 1574 break; 1575 case Op_AddReductionVL: paddq(dst, src); break; 1576 case Op_MulReductionVF: mulss(dst, src); break; 1577 case Op_MulReductionVD: mulsd(dst, src); break; 1578 case Op_MulReductionVI: 1579 switch (typ) { 1580 case T_SHORT: pmullw(dst, src); break; 1581 case T_INT: pmulld(dst, src); break; 1582 default: assert(false, "wrong type"); 1583 } 1584 break; 1585 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1586 vpmullq(dst, dst, src, vector_len); break; 1587 default: assert(false, "wrong opcode"); 1588 } 1589 } 1590 1591 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1592 int vector_len = Assembler::AVX_256bit; 1593 1594 switch (opcode) { 1595 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1596 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1597 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1598 case Op_MinReductionV: 1599 switch (typ) { 1600 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1601 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1602 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1603 case T_LONG: assert(UseAVX > 2, "required"); 1604 vpminsq(dst, src1, src2, vector_len); break; 1605 default: assert(false, "wrong type"); 1606 } 1607 break; 1608 case Op_MaxReductionV: 1609 switch (typ) { 1610 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1611 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1612 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1613 case T_LONG: assert(UseAVX > 2, "required"); 1614 vpmaxsq(dst, src1, src2, vector_len); break; 1615 default: assert(false, "wrong type"); 1616 } 1617 break; 1618 case Op_AddReductionVI: 1619 switch (typ) { 1620 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1621 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1622 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1623 default: assert(false, "wrong type"); 1624 } 1625 break; 1626 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1627 case Op_MulReductionVI: 1628 switch (typ) { 1629 case 
T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1630 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1631 default: assert(false, "wrong type"); 1632 } 1633 break; 1634 case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break; 1635 default: assert(false, "wrong opcode"); 1636 } 1637 } 1638 1639 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1640 XMMRegister dst, XMMRegister src, 1641 XMMRegister vtmp1, XMMRegister vtmp2) { 1642 switch (opcode) { 1643 case Op_AddReductionVF: 1644 case Op_MulReductionVF: 1645 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1646 break; 1647 1648 case Op_AddReductionVD: 1649 case Op_MulReductionVD: 1650 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1651 break; 1652 1653 default: assert(false, "wrong opcode"); 1654 } 1655 } 1656 1657 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1658 Register dst, Register src1, XMMRegister src2, 1659 XMMRegister vtmp1, XMMRegister vtmp2) { 1660 switch (vlen) { 1661 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1662 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1663 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1664 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1665 1666 default: assert(false, "wrong vector length"); 1667 } 1668 } 1669 1670 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1671 Register dst, Register src1, XMMRegister src2, 1672 XMMRegister vtmp1, XMMRegister vtmp2) { 1673 switch (vlen) { 1674 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1675 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1676 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1677 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1678 1679 default: assert(false, "wrong vector length"); 1680 } 1681 } 1682 1683 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1684 Register dst, Register src1, XMMRegister src2, 1685 XMMRegister vtmp1, XMMRegister vtmp2) { 1686 switch (vlen) { 1687 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1688 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1689 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1690 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1691 1692 default: assert(false, "wrong vector length"); 1693 } 1694 } 1695 1696 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1697 Register dst, Register src1, XMMRegister src2, 1698 XMMRegister vtmp1, XMMRegister vtmp2) { 1699 switch (vlen) { 1700 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1701 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1702 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1703 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1704 1705 default: assert(false, "wrong vector length"); 1706 } 1707 } 1708 1709 #ifdef _LP64 1710 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1711 Register dst, Register src1, XMMRegister src2, 1712 XMMRegister vtmp1, XMMRegister vtmp2) { 1713 switch (vlen) { 1714 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1715 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1716 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1717 1718 default: assert(false, "wrong vector length"); 1719 } 1720 } 1721 #endif // _LP64 1722 1723 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, 
XMMRegister vtmp1, XMMRegister vtmp2) { 1724 switch (vlen) { 1725 case 2: 1726 assert(vtmp2 == xnoreg, ""); 1727 reduce2F(opcode, dst, src, vtmp1); 1728 break; 1729 case 4: 1730 assert(vtmp2 == xnoreg, ""); 1731 reduce4F(opcode, dst, src, vtmp1); 1732 break; 1733 case 8: 1734 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1735 break; 1736 case 16: 1737 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1738 break; 1739 default: assert(false, "wrong vector length"); 1740 } 1741 } 1742 1743 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1744 switch (vlen) { 1745 case 2: 1746 assert(vtmp2 == xnoreg, ""); 1747 reduce2D(opcode, dst, src, vtmp1); 1748 break; 1749 case 4: 1750 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1751 break; 1752 case 8: 1753 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1754 break; 1755 default: assert(false, "wrong vector length"); 1756 } 1757 } 1758 1759 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1760 if (opcode == Op_AddReductionVI) { 1761 if (vtmp1 != src2) { 1762 movdqu(vtmp1, src2); 1763 } 1764 phaddd(vtmp1, vtmp1); 1765 } else { 1766 pshufd(vtmp1, src2, 0x1); 1767 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1768 } 1769 movdl(vtmp2, src1); 1770 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1771 movdl(dst, vtmp1); 1772 } 1773 1774 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1775 if (opcode == Op_AddReductionVI) { 1776 if (vtmp1 != src2) { 1777 movdqu(vtmp1, src2); 1778 } 1779 phaddd(vtmp1, src2); 1780 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1781 } else { 1782 pshufd(vtmp2, src2, 0xE); 1783 reduce_operation_128(T_INT, opcode, vtmp2, src2); 1784 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1785 } 1786 } 1787 1788 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1789 if (opcode == Op_AddReductionVI) { 1790 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 1791 vextracti128_high(vtmp2, vtmp1); 1792 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 1793 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1794 } else { 1795 vextracti128_high(vtmp1, src2); 1796 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1797 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1798 } 1799 } 1800 1801 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1802 vextracti64x4_high(vtmp2, src2); 1803 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 1804 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1805 } 1806 1807 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1808 pshufd(vtmp2, src2, 0x1); 1809 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1810 movdqu(vtmp1, vtmp2); 1811 psrldq(vtmp1, 2); 1812 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1813 movdqu(vtmp2, vtmp1); 1814 psrldq(vtmp2, 1); 1815 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1816 movdl(vtmp2, src1); 1817 pmovsxbd(vtmp1, vtmp1); 1818 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1819 pextrb(dst, vtmp1, 0x0); 1820 movsbl(dst, dst); 1821 } 1822 1823 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1824 
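// Fold the high 8 bytes onto the low 8 (pshufd with 0xE moves the high
// quadword into the low position), combine them with the reduction op,
// then hand the surviving 8 bytes to reduce8B.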
pshufd(vtmp1, src2, 0xE); 1825 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 1826 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1827 } 1828 1829 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1830 vextracti128_high(vtmp2, src2); 1831 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1832 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1833 } 1834 1835 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1836 vextracti64x4_high(vtmp1, src2); 1837 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 1838 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1839 } 1840 1841 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1842 pmovsxbw(vtmp2, src2); 1843 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1844 } 1845 1846 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1847 if (UseAVX > 1) { 1848 int vector_len = Assembler::AVX_256bit; 1849 vpmovsxbw(vtmp1, src2, vector_len); 1850 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1851 } else { 1852 pmovsxbw(vtmp2, src2); 1853 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1854 pshufd(vtmp2, src2, 0x1); 1855 pmovsxbw(vtmp2, src2); 1856 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1857 } 1858 } 1859 1860 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1861 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 1862 int vector_len = Assembler::AVX_512bit; 1863 vpmovsxbw(vtmp1, src2, vector_len); 1864 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1865 } else { 1866 assert(UseAVX >= 2,"Should not reach here."); 1867 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 1868 vextracti128_high(vtmp2, src2); 1869 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1870 } 1871 } 1872 1873 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1874 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 1875 vextracti64x4_high(vtmp2, src2); 1876 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1877 } 1878 1879 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1880 if (opcode == Op_AddReductionVI) { 1881 if (vtmp1 != src2) { 1882 movdqu(vtmp1, src2); 1883 } 1884 phaddw(vtmp1, vtmp1); 1885 phaddw(vtmp1, vtmp1); 1886 } else { 1887 pshufd(vtmp2, src2, 0x1); 1888 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 1889 movdqu(vtmp1, vtmp2); 1890 psrldq(vtmp1, 2); 1891 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 1892 } 1893 movdl(vtmp2, src1); 1894 pmovsxwd(vtmp1, vtmp1); 1895 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1896 pextrw(dst, vtmp1, 0x0); 1897 movswl(dst, dst); 1898 } 1899 1900 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1901 if (opcode == Op_AddReductionVI) { 1902 if (vtmp1 != src2) { 1903 movdqu(vtmp1, src2); 1904 } 1905 phaddw(vtmp1, src2); 1906 } else { 1907 pshufd(vtmp1, src2, 0xE); 1908 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 1909 } 1910 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1911 } 1912 1913 void 
C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1914 if (opcode == Op_AddReductionVI) { 1915 int vector_len = Assembler::AVX_256bit; 1916 vphaddw(vtmp2, src2, src2, vector_len); 1917 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 1918 } else { 1919 vextracti128_high(vtmp2, src2); 1920 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 1921 } 1922 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1923 } 1924 1925 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1926 int vector_len = Assembler::AVX_256bit; 1927 vextracti64x4_high(vtmp1, src2); 1928 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 1929 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1930 } 1931 1932 #ifdef _LP64 1933 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1934 pshufd(vtmp2, src2, 0xE); 1935 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 1936 movdq(vtmp1, src1); 1937 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 1938 movdq(dst, vtmp1); 1939 } 1940 1941 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1942 vextracti128_high(vtmp1, src2); 1943 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 1944 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1945 } 1946 1947 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1948 vextracti64x4_high(vtmp2, src2); 1949 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 1950 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1951 } 1952 1953 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 1954 assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid"); 1955 mov64(temp, -1L); 1956 bzhiq(temp, temp, len); 1957 kmovql(dst, temp); 1958 } 1959 #endif // _LP64 1960 1961 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 1962 reduce_operation_128(T_FLOAT, opcode, dst, src); 1963 pshufd(vtmp, src, 0x1); 1964 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1965 } 1966 1967 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 1968 reduce2F(opcode, dst, src, vtmp); 1969 pshufd(vtmp, src, 0x2); 1970 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1971 pshufd(vtmp, src, 0x3); 1972 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1973 } 1974 1975 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1976 reduce4F(opcode, dst, src, vtmp2); 1977 vextractf128_high(vtmp2, src); 1978 reduce4F(opcode, dst, vtmp2, vtmp1); 1979 } 1980 1981 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1982 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1983 vextracti64x4_high(vtmp1, src); 1984 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 1985 } 1986 1987 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 1988 reduce_operation_128(T_DOUBLE, opcode, dst, src); 1989 pshufd(vtmp, src, 0xE); 1990 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 1991 } 1992 1993 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, 
XMMRegister vtmp2) { 1994 reduce2D(opcode, dst, src, vtmp2); 1995 vextractf128_high(vtmp2, src); 1996 reduce2D(opcode, dst, vtmp2, vtmp1); 1997 } 1998 1999 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2000 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2001 vextracti64x4_high(vtmp1, src); 2002 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2003 } 2004 2005 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) { 2006 MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len); 2007 } 2008 2009 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) { 2010 MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len); 2011 } 2012 2013 2014 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2015 XMMRegister dst, XMMRegister src, 2016 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2017 XMMRegister xmm_0, XMMRegister xmm_1) { 2018 int permconst[] = {1, 14}; 2019 XMMRegister wsrc = src; 2020 XMMRegister wdst = xmm_0; 2021 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2022 2023 int vlen_enc = Assembler::AVX_128bit; 2024 if (vlen == 16) { 2025 vlen_enc = Assembler::AVX_256bit; 2026 } 2027 2028 for (int i = log2(vlen) - 1; i >=0; i--) { 2029 if (i == 0 && !is_dst_valid) { 2030 wdst = dst; 2031 } 2032 if (i == 3) { 2033 vextracti64x4_high(wtmp, wsrc); 2034 } else if (i == 2) { 2035 vextracti128_high(wtmp, wsrc); 2036 } else { // i = [0,1] 2037 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2038 } 2039 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2040 wsrc = wdst; 2041 vlen_enc = Assembler::AVX_128bit; 2042 } 2043 if (is_dst_valid) { 2044 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2045 } 2046 } 2047 2048 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2049 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2050 XMMRegister xmm_0, XMMRegister xmm_1) { 2051 XMMRegister wsrc = src; 2052 XMMRegister wdst = xmm_0; 2053 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2054 int vlen_enc = Assembler::AVX_128bit; 2055 if (vlen == 8) { 2056 vlen_enc = Assembler::AVX_256bit; 2057 } 2058 for (int i = log2(vlen) - 1; i >=0; i--) { 2059 if (i == 0 && !is_dst_valid) { 2060 wdst = dst; 2061 } 2062 if (i == 1) { 2063 vextracti128_high(wtmp, wsrc); 2064 } else if (i == 2) { 2065 vextracti64x4_high(wtmp, wsrc); 2066 } else { 2067 assert(i == 0, "%d", i); 2068 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2069 } 2070 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2071 wsrc = wdst; 2072 vlen_enc = Assembler::AVX_128bit; 2073 } 2074 if (is_dst_valid) { 2075 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2076 } 2077 } 2078 2079 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2080 switch (bt) { 2081 case T_BYTE: pextrb(dst, src, idx); break; 2082 case T_SHORT: pextrw(dst, src, idx); break; 2083 case T_INT: pextrd(dst, src, idx); break; 2084 case T_LONG: pextrq(dst, src, idx); break; 2085 2086 default: 2087 assert(false,"Should not reach here."); 2088 break; 2089 } 2090 } 2091 2092 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2093 int esize = type2aelembytes(typ); 2094 int elem_per_lane = 16/esize; 2095 int lane = elemindex / elem_per_lane; 2096 int eindex = elemindex % elem_per_lane; 2097 2098 if (lane >= 2) { 2099 assert(UseAVX > 2, "required"); 2100 vextractf32x4(dst, src, lane & 3); 2101 return dst; 2102 } else if (lane > 0) { 2103 assert(UseAVX > 0, "required"); 2104 vextractf128(dst, src, lane); 2105 return dst; 2106 } else { 2107 return src; 2108 } 2109 } 2110 2111 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2112 int esize = type2aelembytes(typ); 2113 int elem_per_lane = 16/esize; 2114 int eindex = elemindex % elem_per_lane; 2115 assert(is_integral_type(typ),"required"); 2116 2117 if (eindex == 0) { 2118 if (typ == T_LONG) { 2119 movq(dst, src); 2120 } else { 2121 movdl(dst, src); 2122 if (typ == T_BYTE) 2123 movsbl(dst, dst); 2124 else if (typ == T_SHORT) 2125 movswl(dst, dst); 2126 } 2127 } else { 2128 extract(typ, dst, src, eindex); 2129 } 2130 } 2131 2132 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) { 2133 int esize = type2aelembytes(typ); 2134 int elem_per_lane = 16/esize; 2135 int eindex = elemindex % elem_per_lane; 2136 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2137 2138 if (eindex == 0) { 2139 movq(dst, src); 2140 } else { 2141 if (typ == T_FLOAT) { 2142 if (UseAVX == 0) { 2143 movdqu(dst, src); 2144 pshufps(dst, dst, eindex); 2145 } else { 2146 vpshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2147 } 2148 } else { 2149 if (UseAVX == 0) { 2150 movdqu(dst, src); 2151 psrldq(dst, eindex*esize); 2152 } else { 2153 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2154 } 2155 movq(dst, dst); 2156 } 2157 } 2158 // Zero upper bits 2159 if (typ == T_FLOAT) { 2160 if (UseAVX == 0) { 2161 assert((vtmp != xnoreg) && (tmp != noreg), "required."); 2162 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp); 2163 pand(dst, vtmp); 2164 } else { 2165 assert((tmp != noreg), "required."); 2166 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp); 2167 } 2168 } 2169 } 2170 2171 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, 
XMMRegister src2, int comparison, int vector_len) { 2172 switch(typ) { 2173 case T_BYTE: 2174 case T_BOOLEAN: 2175 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2176 break; 2177 case T_SHORT: 2178 case T_CHAR: 2179 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2180 break; 2181 case T_INT: 2182 case T_FLOAT: 2183 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2184 break; 2185 case T_LONG: 2186 case T_DOUBLE: 2187 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2188 break; 2189 default: 2190 assert(false,"Should not reach here."); 2191 break; 2192 } 2193 } 2194 2195 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) { 2196 switch(typ) { 2197 case T_BOOLEAN: 2198 case T_BYTE: 2199 evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2200 break; 2201 case T_CHAR: 2202 case T_SHORT: 2203 evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2204 break; 2205 case T_INT: 2206 case T_FLOAT: 2207 evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2208 break; 2209 case T_LONG: 2210 case T_DOUBLE: 2211 evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2212 break; 2213 default: 2214 assert(false,"Should not reach here."); 2215 break; 2216 } 2217 } 2218 2219 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2220 switch(typ) { 2221 case T_BYTE: 2222 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2223 break; 2224 case T_SHORT: 2225 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2226 break; 2227 case T_INT: 2228 case T_FLOAT: 2229 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2230 break; 2231 case T_LONG: 2232 case T_DOUBLE: 2233 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2234 break; 2235 default: 2236 assert(false,"Should not reach here."); 2237 break; 2238 } 2239 } 2240 2241 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2, 2242 XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) { 2243 switch(vlen) { 2244 case 4: 2245 assert(vtmp1 != xnoreg, "required."); 2246 // Broadcast lower 32 bits to 128 bits before ptest 2247 pshufd(vtmp1, src1, 0x0); 2248 if (bt == BoolTest::overflow) { 2249 assert(vtmp2 != xnoreg, "required."); 2250 pshufd(vtmp2, src2, 0x0); 2251 } else { 2252 assert(vtmp2 == xnoreg, "required."); 2253 vtmp2 = src2; 2254 } 2255 ptest(vtmp1, vtmp2); 2256 break; 2257 case 8: 2258 assert(vtmp1 != xnoreg, "required."); 2259 // Broadcast lower 64 bits to 128 bits before ptest 2260 pshufd(vtmp1, src1, 0x4); 2261 if (bt == BoolTest::overflow) { 2262 assert(vtmp2 != xnoreg, "required."); 2263 pshufd(vtmp2, src2, 0x4); 2264 } else { 2265 assert(vtmp2 == xnoreg, "required."); 2266 vtmp2 = src2; 2267 } 2268 ptest(vtmp1, vtmp2); 2269 break; 2270 case 16: 2271 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2272 ptest(src1, src2); 2273 break; 2274 case 32: 2275 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2276 vptest(src1, src2, Assembler::AVX_256bit); 2277 break; 2278 case 64: 2279 { 2280 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2281 evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit); 2282 if (bt == 
BoolTest::ne) { 2283 ktestql(mask, mask); 2284 } else { 2285 assert(bt == BoolTest::overflow, "required"); 2286 kortestql(mask, mask); 2287 } 2288 } 2289 break; 2290 default: 2291 assert(false,"Should not reach here."); 2292 break; 2293 } 2294 } 2295 2296 //------------------------------------------------------------------------------------------- 2297 2298 // IndexOf for constant substrings with size >= 8 chars 2299 // which don't need to be loaded through stack. 2300 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2301 Register cnt1, Register cnt2, 2302 int int_cnt2, Register result, 2303 XMMRegister vec, Register tmp, 2304 int ae) { 2305 ShortBranchVerifier sbv(this); 2306 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2307 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2308 2309 // This method uses the pcmpestri instruction with bound registers 2310 // inputs: 2311 // xmm - substring 2312 // rax - substring length (elements count) 2313 // mem - scanned string 2314 // rdx - string length (elements count) 2315 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2316 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2317 // outputs: 2318 // rcx - matched index in string 2319 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2320 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2321 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2322 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2323 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2324 2325 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2326 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2327 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2328 2329 // Note, inline_string_indexOf() generates checks: 2330 // if (substr.count > string.count) return -1; 2331 // if (substr.count == 0) return 0; 2332 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2333 2334 // Load substring. 2335 if (ae == StrIntrinsicNode::UL) { 2336 pmovzxbw(vec, Address(str2, 0)); 2337 } else { 2338 movdqu(vec, Address(str2, 0)); 2339 } 2340 movl(cnt2, int_cnt2); 2341 movptr(result, str1); // string addr 2342 2343 if (int_cnt2 > stride) { 2344 jmpb(SCAN_TO_SUBSTR); 2345 2346 // Reload substr for rescan, this code 2347 // is executed only for large substrings (> 8 chars) 2348 bind(RELOAD_SUBSTR); 2349 if (ae == StrIntrinsicNode::UL) { 2350 pmovzxbw(vec, Address(str2, 0)); 2351 } else { 2352 movdqu(vec, Address(str2, 0)); 2353 } 2354 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2355 2356 bind(RELOAD_STR); 2357 // We came here after the beginning of the substring was 2358 // matched but the rest of it was not so we need to search 2359 // again. Start from the next element after the previous match. 2360 2361 // cnt2 is number of substring remaining elements and 2362 // cnt1 is number of string remaining elements when cmp failed.
2363 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2364 subl(cnt1, cnt2); 2365 addl(cnt1, int_cnt2); 2366 movl(cnt2, int_cnt2); // Now restore cnt2 2367 2368 decrementl(cnt1); // Shift to next element 2369 cmpl(cnt1, cnt2); 2370 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2371 2372 addptr(result, (1<<scale1)); 2373 2374 } // (int_cnt2 > 8) 2375 2376 // Scan string for start of substr in 16-byte vectors 2377 bind(SCAN_TO_SUBSTR); 2378 pcmpestri(vec, Address(result, 0), mode); 2379 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2380 subl(cnt1, stride); 2381 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2382 cmpl(cnt1, cnt2); 2383 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2384 addptr(result, 16); 2385 jmpb(SCAN_TO_SUBSTR); 2386 2387 // Found a potential substr 2388 bind(FOUND_CANDIDATE); 2389 // Matched whole vector if first element matched (tmp(rcx) == 0). 2390 if (int_cnt2 == stride) { 2391 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2392 } else { // int_cnt2 > 8 2393 jccb(Assembler::overflow, FOUND_SUBSTR); 2394 } 2395 // After pcmpestri tmp(rcx) contains matched element index 2396 // Compute start addr of substr 2397 lea(result, Address(result, tmp, scale1)); 2398 2399 // Make sure string is still long enough 2400 subl(cnt1, tmp); 2401 cmpl(cnt1, cnt2); 2402 if (int_cnt2 == stride) { 2403 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2404 } else { // int_cnt2 > 8 2405 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2406 } 2407 // Left less than substring. 2408 2409 bind(RET_NOT_FOUND); 2410 movl(result, -1); 2411 jmp(EXIT); 2412 2413 if (int_cnt2 > stride) { 2414 // This code is optimized for the case when the whole substring 2415 // is matched if its head is matched. 2416 bind(MATCH_SUBSTR_HEAD); 2417 pcmpestri(vec, Address(result, 0), mode); 2418 // Reload only the string if it does not match 2419 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2420 2421 Label CONT_SCAN_SUBSTR; 2422 // Compare the rest of substring (> 8 chars). 2423 bind(FOUND_SUBSTR); 2424 // First 8 chars are already matched.
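// The tail of the substring is scanned with a negative counter that walks up
// toward zero: e.g. (illustrative values) with stride == 8 and int_cnt2 == 20,
// cnt2 becomes -12 after the next two instructions, and every pass through
// SCAN_SUBSTR adds stride until cnt2 is no longer negative, i.e. the whole
// substring has been compared.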
2425 negptr(cnt2); 2426 addptr(cnt2, stride); 2427 2428 bind(SCAN_SUBSTR); 2429 subl(cnt1, stride); 2430 cmpl(cnt2, -stride); // Do not read beyond substring 2431 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2432 // Back-up strings to avoid reading beyond substring: 2433 // cnt1 = cnt1 - cnt2 + 8 2434 addl(cnt1, cnt2); // cnt2 is negative 2435 addl(cnt1, stride); 2436 movl(cnt2, stride); negptr(cnt2); 2437 bind(CONT_SCAN_SUBSTR); 2438 if (int_cnt2 < (int)G) { 2439 int tail_off1 = int_cnt2<<scale1; 2440 int tail_off2 = int_cnt2<<scale2; 2441 if (ae == StrIntrinsicNode::UL) { 2442 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2443 } else { 2444 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2445 } 2446 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2447 } else { 2448 // calculate index in register to avoid integer overflow (int_cnt2*2) 2449 movl(tmp, int_cnt2); 2450 addptr(tmp, cnt2); 2451 if (ae == StrIntrinsicNode::UL) { 2452 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2453 } else { 2454 movdqu(vec, Address(str2, tmp, scale2, 0)); 2455 } 2456 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2457 } 2458 // Need to reload strings pointers if not matched whole vector 2459 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2460 addptr(cnt2, stride); 2461 jcc(Assembler::negative, SCAN_SUBSTR); 2462 // Fall through if found full substring 2463 2464 } // (int_cnt2 > 8) 2465 2466 bind(RET_FOUND); 2467 // Found result if we matched full small substring. 2468 // Compute substr offset 2469 subptr(result, str1); 2470 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2471 shrl(result, 1); // index 2472 } 2473 bind(EXIT); 2474 2475 } // string_indexofC8 2476 2477 // Small strings are loaded through stack if they cross page boundary. 2478 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2479 Register cnt1, Register cnt2, 2480 int int_cnt2, Register result, 2481 XMMRegister vec, Register tmp, 2482 int ae) { 2483 ShortBranchVerifier sbv(this); 2484 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2485 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2486 2487 // 2488 // int_cnt2 is length of small (< 8 chars) constant substring 2489 // or (-1) for non constant substring in which case its length 2490 // is in cnt2 register. 2491 // 2492 // Note, inline_string_indexOf() generates checks: 2493 // if (substr.count > string.count) return -1; 2494 // if (substr.count == 0) return 0; 2495 // 2496 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2497 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2498 // This method uses the pcmpestri instruction with bound registers 2499 // inputs: 2500 // xmm - substring 2501 // rax - substring length (elements count) 2502 // mem - scanned string 2503 // rdx - string length (elements count) 2504 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2505 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2506 // outputs: 2507 // rcx - matched index in string 2508 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2509 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2510 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2511 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2512 2513 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2514 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2515 FOUND_CANDIDATE; 2516 2517 { //======================================================== 2518 // We don't know where these strings are located 2519 // and we can't read beyond them. Load them through stack. 2520 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2521 2522 movptr(tmp, rsp); // save old SP 2523 2524 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2525 if (int_cnt2 == (1>>scale2)) { // One byte 2526 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2527 load_unsigned_byte(result, Address(str2, 0)); 2528 movdl(vec, result); // move 32 bits 2529 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2530 // Not enough header space in 32-bit VM: 12+3 = 15. 2531 movl(result, Address(str2, -1)); 2532 shrl(result, 8); 2533 movdl(vec, result); // move 32 bits 2534 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2535 load_unsigned_short(result, Address(str2, 0)); 2536 movdl(vec, result); // move 32 bits 2537 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2538 movdl(vec, Address(str2, 0)); // move 32 bits 2539 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2540 movq(vec, Address(str2, 0)); // move 64 bits 2541 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2542 // Array header size is 12 bytes in 32-bit VM 2543 // + 6 bytes for 3 chars == 18 bytes, 2544 // enough space to load vec and shift. 2545 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2546 if (ae == StrIntrinsicNode::UL) { 2547 int tail_off = int_cnt2-8; 2548 pmovzxbw(vec, Address(str2, tail_off)); 2549 psrldq(vec, -2*tail_off); 2550 } 2551 else { 2552 int tail_off = int_cnt2*(1<<scale2); 2553 movdqu(vec, Address(str2, tail_off-16)); 2554 psrldq(vec, 16-tail_off); 2555 } 2556 } 2557 } else { // not constant substring 2558 cmpl(cnt2, stride); 2559 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2560 2561 // We can read beyond string if srt+16 does not cross page boundary 2562 // since heaps are aligned and mapped by pages. 2563 assert(os::vm_page_size() < (int)G, "default page should be small"); 2564 movl(result, str2); // We need only low 32 bits 2565 andl(result, (os::vm_page_size()-1)); 2566 cmpl(result, (os::vm_page_size()-16)); 2567 jccb(Assembler::belowEqual, CHECK_STR); 2568 2569 // Move small strings to stack to allow load 16 bytes into vec. 2570 subptr(rsp, 16); 2571 int stk_offset = wordSize-(1<<scale2); 2572 push(cnt2); 2573 2574 bind(COPY_SUBSTR); 2575 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2576 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2577 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2578 } else if (ae == StrIntrinsicNode::UU) { 2579 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2580 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2581 } 2582 decrement(cnt2); 2583 jccb(Assembler::notZero, COPY_SUBSTR); 2584 2585 pop(cnt2); 2586 movptr(str2, rsp); // New substring address 2587 } // non constant 2588 2589 bind(CHECK_STR); 2590 cmpl(cnt1, stride); 2591 jccb(Assembler::aboveEqual, BIG_STRINGS); 2592 2593 // Check cross page boundary. 
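// Same page trick as for str2 above: a 16-byte load starting at some address
// is safe as long as (address & (page_size - 1)) <= page_size - 16, i.e. the
// load cannot run onto the next page. With a 4096-byte page, offset 4080 is
// the last position from which a full 16-byte load still fits (illustrative
// arithmetic).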
2594 movl(result, str1); // We need only low 32 bits 2595 andl(result, (os::vm_page_size()-1)); 2596 cmpl(result, (os::vm_page_size()-16)); 2597 jccb(Assembler::belowEqual, BIG_STRINGS); 2598 2599 subptr(rsp, 16); 2600 int stk_offset = -(1<<scale1); 2601 if (int_cnt2 < 0) { // not constant 2602 push(cnt2); 2603 stk_offset += wordSize; 2604 } 2605 movl(cnt2, cnt1); 2606 2607 bind(COPY_STR); 2608 if (ae == StrIntrinsicNode::LL) { 2609 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2610 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2611 } else { 2612 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2613 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2614 } 2615 decrement(cnt2); 2616 jccb(Assembler::notZero, COPY_STR); 2617 2618 if (int_cnt2 < 0) { // not constant 2619 pop(cnt2); 2620 } 2621 movptr(str1, rsp); // New string address 2622 2623 bind(BIG_STRINGS); 2624 // Load substring. 2625 if (int_cnt2 < 0) { // -1 2626 if (ae == StrIntrinsicNode::UL) { 2627 pmovzxbw(vec, Address(str2, 0)); 2628 } else { 2629 movdqu(vec, Address(str2, 0)); 2630 } 2631 push(cnt2); // substr count 2632 push(str2); // substr addr 2633 push(str1); // string addr 2634 } else { 2635 // Small (< 8 chars) constant substrings are loaded already. 2636 movl(cnt2, int_cnt2); 2637 } 2638 push(tmp); // original SP 2639 2640 } // Finished loading 2641 2642 //======================================================== 2643 // Start search 2644 // 2645 2646 movptr(result, str1); // string addr 2647 2648 if (int_cnt2 < 0) { // Only for non constant substring 2649 jmpb(SCAN_TO_SUBSTR); 2650 2651 // SP saved at sp+0 2652 // String saved at sp+1*wordSize 2653 // Substr saved at sp+2*wordSize 2654 // Substr count saved at sp+3*wordSize 2655 2656 // Reload substr for rescan, this code 2657 // is executed only for large substrings (> 8 chars) 2658 bind(RELOAD_SUBSTR); 2659 movptr(str2, Address(rsp, 2*wordSize)); 2660 movl(cnt2, Address(rsp, 3*wordSize)); 2661 if (ae == StrIntrinsicNode::UL) { 2662 pmovzxbw(vec, Address(str2, 0)); 2663 } else { 2664 movdqu(vec, Address(str2, 0)); 2665 } 2666 // We came here after the beginning of the substring was 2667 // matched but the rest of it was not so we need to search 2668 // again. Start from the next element after the previous match. 2669 subptr(str1, result); // Restore counter 2670 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2671 shrl(str1, 1); 2672 } 2673 addl(cnt1, str1); 2674 decrementl(cnt1); // Shift to next element 2675 cmpl(cnt1, cnt2); 2676 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2677 2678 addptr(result, (1<<scale1)); 2679 } // non constant 2680 2681 // Scan string for start of substr in 16-byte vectors 2682 bind(SCAN_TO_SUBSTR); 2683 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2684 pcmpestri(vec, Address(result, 0), mode); 2685 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2686 subl(cnt1, stride); 2687 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2688 cmpl(cnt1, cnt2); 2689 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2690 addptr(result, 16); 2691 2692 bind(ADJUST_STR); 2693 cmpl(cnt1, stride); // Do not read beyond string 2694 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2695 // Back-up string to avoid reading beyond string. 
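// result is pulled back so that the final 16-byte read ends exactly at the
// end of the string: e.g. (UU case, illustrative) with 3 chars left, result
// moves back by 16 - 3*2 = 10 bytes and cnt1 is reset to a full stride for
// the rescan.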
2696 lea(result, Address(result, cnt1, scale1, -16)); 2697 movl(cnt1, stride); 2698 jmpb(SCAN_TO_SUBSTR); 2699 2700 // Found a potential substr 2701 bind(FOUND_CANDIDATE); 2702 // After pcmpestri tmp(rcx) contains matched element index 2703 2704 // Make sure string is still long enough 2705 subl(cnt1, tmp); 2706 cmpl(cnt1, cnt2); 2707 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 2708 // Left less then substring. 2709 2710 bind(RET_NOT_FOUND); 2711 movl(result, -1); 2712 jmp(CLEANUP); 2713 2714 bind(FOUND_SUBSTR); 2715 // Compute start addr of substr 2716 lea(result, Address(result, tmp, scale1)); 2717 if (int_cnt2 > 0) { // Constant substring 2718 // Repeat search for small substring (< 8 chars) 2719 // from new point without reloading substring. 2720 // Have to check that we don't read beyond string. 2721 cmpl(tmp, stride-int_cnt2); 2722 jccb(Assembler::greater, ADJUST_STR); 2723 // Fall through if matched whole substring. 2724 } else { // non constant 2725 assert(int_cnt2 == -1, "should be != 0"); 2726 2727 addl(tmp, cnt2); 2728 // Found result if we matched whole substring. 2729 cmpl(tmp, stride); 2730 jcc(Assembler::lessEqual, RET_FOUND); 2731 2732 // Repeat search for small substring (<= 8 chars) 2733 // from new point 'str1' without reloading substring. 2734 cmpl(cnt2, stride); 2735 // Have to check that we don't read beyond string. 2736 jccb(Assembler::lessEqual, ADJUST_STR); 2737 2738 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 2739 // Compare the rest of substring (> 8 chars). 2740 movptr(str1, result); 2741 2742 cmpl(tmp, cnt2); 2743 // First 8 chars are already matched. 2744 jccb(Assembler::equal, CHECK_NEXT); 2745 2746 bind(SCAN_SUBSTR); 2747 pcmpestri(vec, Address(str1, 0), mode); 2748 // Need to reload strings pointers if not matched whole vector 2749 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2750 2751 bind(CHECK_NEXT); 2752 subl(cnt2, stride); 2753 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 2754 addptr(str1, 16); 2755 if (ae == StrIntrinsicNode::UL) { 2756 addptr(str2, 8); 2757 } else { 2758 addptr(str2, 16); 2759 } 2760 subl(cnt1, stride); 2761 cmpl(cnt2, stride); // Do not read beyond substring 2762 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 2763 // Back-up strings to avoid reading beyond substring. 
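// Both pointers are pulled back so the last full vector load ends exactly at
// the end of the substring, and the counters are fixed up to match:
// cnt1 = cnt1 - cnt2 + stride and cnt2 = stride (e.g., illustrative UU
// values: cnt2 == 3 chars left means str1 and str2 step back by
// 16 - 3*2 = 10 bytes).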
2764 2765 if (ae == StrIntrinsicNode::UL) { 2766 lea(str2, Address(str2, cnt2, scale2, -8)); 2767 lea(str1, Address(str1, cnt2, scale1, -16)); 2768 } else { 2769 lea(str2, Address(str2, cnt2, scale2, -16)); 2770 lea(str1, Address(str1, cnt2, scale1, -16)); 2771 } 2772 subl(cnt1, cnt2); 2773 movl(cnt2, stride); 2774 addl(cnt1, stride); 2775 bind(CONT_SCAN_SUBSTR); 2776 if (ae == StrIntrinsicNode::UL) { 2777 pmovzxbw(vec, Address(str2, 0)); 2778 } else { 2779 movdqu(vec, Address(str2, 0)); 2780 } 2781 jmp(SCAN_SUBSTR); 2782 2783 bind(RET_FOUND_LONG); 2784 movptr(str1, Address(rsp, wordSize)); 2785 } // non constant 2786 2787 bind(RET_FOUND); 2788 // Compute substr offset 2789 subptr(result, str1); 2790 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2791 shrl(result, 1); // index 2792 } 2793 bind(CLEANUP); 2794 pop(rsp); // restore SP 2795 2796 } // string_indexof 2797 2798 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 2799 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 2800 ShortBranchVerifier sbv(this); 2801 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2802 2803 int stride = 8; 2804 2805 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 2806 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 2807 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 2808 FOUND_SEQ_CHAR, DONE_LABEL; 2809 2810 movptr(result, str1); 2811 if (UseAVX >= 2) { 2812 cmpl(cnt1, stride); 2813 jcc(Assembler::less, SCAN_TO_CHAR); 2814 cmpl(cnt1, 2*stride); 2815 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 2816 movdl(vec1, ch); 2817 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 2818 vpxor(vec2, vec2); 2819 movl(tmp, cnt1); 2820 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 2821 andl(cnt1,0x0000000F); //tail count (in chars) 2822 2823 bind(SCAN_TO_16_CHAR_LOOP); 2824 vmovdqu(vec3, Address(result, 0)); 2825 vpcmpeqw(vec3, vec3, vec1, 1); 2826 vptest(vec2, vec3); 2827 jcc(Assembler::carryClear, FOUND_CHAR); 2828 addptr(result, 32); 2829 subl(tmp, 2*stride); 2830 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 2831 jmp(SCAN_TO_8_CHAR); 2832 bind(SCAN_TO_8_CHAR_INIT); 2833 movdl(vec1, ch); 2834 pshuflw(vec1, vec1, 0x00); 2835 pshufd(vec1, vec1, 0); 2836 pxor(vec2, vec2); 2837 } 2838 bind(SCAN_TO_8_CHAR); 2839 cmpl(cnt1, stride); 2840 jcc(Assembler::less, SCAN_TO_CHAR); 2841 if (UseAVX < 2) { 2842 movdl(vec1, ch); 2843 pshuflw(vec1, vec1, 0x00); 2844 pshufd(vec1, vec1, 0); 2845 pxor(vec2, vec2); 2846 } 2847 movl(tmp, cnt1); 2848 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 2849 andl(cnt1,0x00000007); //tail count (in chars) 2850 2851 bind(SCAN_TO_8_CHAR_LOOP); 2852 movdqu(vec3, Address(result, 0)); 2853 pcmpeqw(vec3, vec1); 2854 ptest(vec2, vec3); 2855 jcc(Assembler::carryClear, FOUND_CHAR); 2856 addptr(result, 16); 2857 subl(tmp, stride); 2858 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 2859 bind(SCAN_TO_CHAR); 2860 testl(cnt1, cnt1); 2861 jcc(Assembler::zero, RET_NOT_FOUND); 2862 bind(SCAN_TO_CHAR_LOOP); 2863 load_unsigned_short(tmp, Address(result, 0)); 2864 cmpl(ch, tmp); 2865 jccb(Assembler::equal, FOUND_SEQ_CHAR); 2866 addptr(result, 2); 2867 subl(cnt1, 1); 2868 jccb(Assembler::zero, RET_NOT_FOUND); 2869 jmp(SCAN_TO_CHAR_LOOP); 2870 2871 bind(RET_NOT_FOUND); 2872 movl(result, -1); 2873 jmpb(DONE_LABEL); 2874 2875 bind(FOUND_CHAR); 2876 if (UseAVX >= 2) { 2877 vpmovmskb(tmp, vec3); 2878 } else { 2879 pmovmskb(tmp, vec3); 2880 } 2881 bsfl(ch, tmp); 2882 addptr(result, ch); 2883 2884 bind(FOUND_SEQ_CHAR); 2885 
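// result holds the address of the matching char; subtracting the base and
// halving below turns it into a UTF-16 element index.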
subptr(result, str1); 2886 shrl(result, 1); 2887 2888 bind(DONE_LABEL); 2889 } // string_indexof_char 2890 2891 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 2892 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 2893 ShortBranchVerifier sbv(this); 2894 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2895 2896 int stride = 16; 2897 2898 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 2899 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 2900 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 2901 FOUND_SEQ_CHAR, DONE_LABEL; 2902 2903 movptr(result, str1); 2904 if (UseAVX >= 2) { 2905 cmpl(cnt1, stride); 2906 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 2907 cmpl(cnt1, stride*2); 2908 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 2909 movdl(vec1, ch); 2910 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 2911 vpxor(vec2, vec2); 2912 movl(tmp, cnt1); 2913 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 2914 andl(cnt1,0x0000001F); //tail count (in chars) 2915 2916 bind(SCAN_TO_32_CHAR_LOOP); 2917 vmovdqu(vec3, Address(result, 0)); 2918 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 2919 vptest(vec2, vec3); 2920 jcc(Assembler::carryClear, FOUND_CHAR); 2921 addptr(result, 32); 2922 subl(tmp, stride*2); 2923 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 2924 jmp(SCAN_TO_16_CHAR); 2925 2926 bind(SCAN_TO_16_CHAR_INIT); 2927 movdl(vec1, ch); 2928 pxor(vec2, vec2); 2929 pshufb(vec1, vec2); 2930 } 2931 2932 bind(SCAN_TO_16_CHAR); 2933 cmpl(cnt1, stride); 2934 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 2935 if (UseAVX < 2) { 2936 movdl(vec1, ch); 2937 pxor(vec2, vec2); 2938 pshufb(vec1, vec2); 2939 } 2940 movl(tmp, cnt1); 2941 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 2942 andl(cnt1,0x0000000F); //tail count (in bytes) 2943 2944 bind(SCAN_TO_16_CHAR_LOOP); 2945 movdqu(vec3, Address(result, 0)); 2946 pcmpeqb(vec3, vec1); 2947 ptest(vec2, vec3); 2948 jcc(Assembler::carryClear, FOUND_CHAR); 2949 addptr(result, 16); 2950 subl(tmp, stride); 2951 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
2952 2953 bind(SCAN_TO_CHAR_INIT); 2954 testl(cnt1, cnt1); 2955 jcc(Assembler::zero, RET_NOT_FOUND); 2956 bind(SCAN_TO_CHAR_LOOP); 2957 load_unsigned_byte(tmp, Address(result, 0)); 2958 cmpl(ch, tmp); 2959 jccb(Assembler::equal, FOUND_SEQ_CHAR); 2960 addptr(result, 1); 2961 subl(cnt1, 1); 2962 jccb(Assembler::zero, RET_NOT_FOUND); 2963 jmp(SCAN_TO_CHAR_LOOP); 2964 2965 bind(RET_NOT_FOUND); 2966 movl(result, -1); 2967 jmpb(DONE_LABEL); 2968 2969 bind(FOUND_CHAR); 2970 if (UseAVX >= 2) { 2971 vpmovmskb(tmp, vec3); 2972 } else { 2973 pmovmskb(tmp, vec3); 2974 } 2975 bsfl(ch, tmp); 2976 addptr(result, ch); 2977 2978 bind(FOUND_SEQ_CHAR); 2979 subptr(result, str1); 2980 2981 bind(DONE_LABEL); 2982 } // stringL_indexof_char 2983 2984 // helper function for string_compare 2985 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 2986 Address::ScaleFactor scale, Address::ScaleFactor scale1, 2987 Address::ScaleFactor scale2, Register index, int ae) { 2988 if (ae == StrIntrinsicNode::LL) { 2989 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 2990 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 2991 } else if (ae == StrIntrinsicNode::UU) { 2992 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 2993 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 2994 } else { 2995 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 2996 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 2997 } 2998 } 2999 3000 // Compare strings, used for char[] and byte[]. 3001 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3002 Register cnt1, Register cnt2, Register result, 3003 XMMRegister vec1, int ae, KRegister mask) { 3004 ShortBranchVerifier sbv(this); 3005 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3006 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3007 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3008 int stride2x2 = 0x40; 3009 Address::ScaleFactor scale = Address::no_scale; 3010 Address::ScaleFactor scale1 = Address::no_scale; 3011 Address::ScaleFactor scale2 = Address::no_scale; 3012 3013 if (ae != StrIntrinsicNode::LL) { 3014 stride2x2 = 0x20; 3015 } 3016 3017 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3018 shrl(cnt2, 1); 3019 } 3020 // Compute the minimum of the string lengths and the 3021 // difference of the string lengths (stack). 3022 // Do the conditional move stuff 3023 movl(result, cnt1); 3024 subl(cnt1, cnt2); 3025 push(cnt1); 3026 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3027 3028 // Is the minimum length zero? 
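// At this point cnt2 holds min(cnt1, cnt2) and the signed length difference
// is saved on the stack (e.g., illustrative lengths 5 and 3 leave a diff of 2
// on the stack and cnt2 == 3). If the common portion is empty, the comparison
// is decided by that length difference alone.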
3029 testl(cnt2, cnt2); 3030 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3031 if (ae == StrIntrinsicNode::LL) { 3032 // Load first bytes 3033 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3034 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3035 } else if (ae == StrIntrinsicNode::UU) { 3036 // Load first characters 3037 load_unsigned_short(result, Address(str1, 0)); 3038 load_unsigned_short(cnt1, Address(str2, 0)); 3039 } else { 3040 load_unsigned_byte(result, Address(str1, 0)); 3041 load_unsigned_short(cnt1, Address(str2, 0)); 3042 } 3043 subl(result, cnt1); 3044 jcc(Assembler::notZero, POP_LABEL); 3045 3046 if (ae == StrIntrinsicNode::UU) { 3047 // Divide length by 2 to get number of chars 3048 shrl(cnt2, 1); 3049 } 3050 cmpl(cnt2, 1); 3051 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3052 3053 // Check if the strings start at the same location and setup scale and stride 3054 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3055 cmpptr(str1, str2); 3056 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3057 if (ae == StrIntrinsicNode::LL) { 3058 scale = Address::times_1; 3059 stride = 16; 3060 } else { 3061 scale = Address::times_2; 3062 stride = 8; 3063 } 3064 } else { 3065 scale1 = Address::times_1; 3066 scale2 = Address::times_2; 3067 // scale not used 3068 stride = 8; 3069 } 3070 3071 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3072 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3073 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3074 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3075 Label COMPARE_TAIL_LONG; 3076 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3077 3078 int pcmpmask = 0x19; 3079 if (ae == StrIntrinsicNode::LL) { 3080 pcmpmask &= ~0x01; 3081 } 3082 3083 // Setup to compare 16-chars (32-bytes) vectors, 3084 // start from first character again because it has aligned address. 3085 if (ae == StrIntrinsicNode::LL) { 3086 stride2 = 32; 3087 } else { 3088 stride2 = 16; 3089 } 3090 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3091 adr_stride = stride << scale; 3092 } else { 3093 adr_stride1 = 8; //stride << scale1; 3094 adr_stride2 = 16; //stride << scale2; 3095 } 3096 3097 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3098 // rax and rdx are used by pcmpestri as elements counters 3099 movl(result, cnt2); 3100 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3101 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3102 3103 // fast path : compare first 2 8-char vectors. 
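// pcmpmask 0x19 (0x18 for LL) selects "equal each" aggregation with negative
// polarity, so pcmpestri reports the first position at which the operands
// differ: CF is set when a mismatch exists and rcx receives the index of the
// first mismatching element, which COMPARE_INDEX_CHAR below reloads and
// subtracts to form the result.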
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    addl(cnt1, stride);

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);

    // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2 - 1);   // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      }
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);   // miscompare
      addptr(result, stride2x2);  // update since we already compared at this addr
      subl(cnt2, stride2x2);      // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())
#endif // _LP64

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower) 16-bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 to 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1     - substring
    //     rax      - negative string length (elements count)
    //     mem      - scanned string
    //     rdx      - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //                + 00 (unsigned bytes) or + 01 (unsigned shorts)
    //   outputs:
    //     rcx      - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length. Return the length difference.
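  // e.g. for LL inputs "abc" vs "abcd" every compared element matches, so the length
  // difference pushed near the start of this method (cnt1 - cnt2 == 3 - 4 == -1) is
  // what gets popped and returned below.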
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

#ifdef _LP64
  if (VM_Version::supports_avx512vlbw()) {

    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    kmovql(cnt1, mask);
    notq(cnt1);
    bsfq(cnt2, cnt1);
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())
#endif // _LP64

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if (ae == StrIntrinsicNode::UL) {
    negl(result);
  }
}

// Search for a non-ASCII character (negative byte value) in a byte array,
// return the index of the first such character, otherwise the length
// of the array segment searched.
//   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
//   @IntrinsicCandidate
//   public static int countPositives(byte[] ba, int off, int len) {
//     for (int i = off; i < off + len; i++) {
//       if (ba[i] < 0) {
//         return i - off;
//       }
//     }
//     return len;
//   }
void C2_MacroAssembler::count_positives(Register ary1, Register len,
                                        Register result, Register tmp1,
                                        XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy
  // len == 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail, BREAK_LOOP;
    Register tmp3_aliased = len;

    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
    andl(len, ~(64 - 1)); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);
    jcc(Assembler::zero, DONE);

    // ~(~0 << len) applied up to two times (for 32-bit scenario)
#ifdef _LP64
    mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
    shlxq(tmp3_aliased, tmp3_aliased, tmp1);
    notq(tmp3_aliased);
    kmovql(mask2, tmp3_aliased);
#else
    Label k_init;
    jmp(k_init);

    // We cannot read 64 bits from a general purpose register, thus we move the
    // data required to compose 64 1's to the instruction stream.
    // We emit a 64-byte-wide series of elements from 0..63 which is later used as
    // compare targets together with the tail count contained in the tmp1 register.
    // The result is a k register having tmp1 consecutive 1's, counting from the
    // least significant bit.
    address tmp = pc();
    emit_int64(0x0706050403020100);
    emit_int64(0x0F0E0D0C0B0A0908);
    emit_int64(0x1716151413121110);
    emit_int64(0x1F1E1D1C1B1A1918);
    emit_int64(0x2726252423222120);
    emit_int64(0x2F2E2D2C2B2A2928);
    emit_int64(0x3736353433323130);
    emit_int64(0x3F3E3D3C3B3A3938);

    bind(k_init);
    lea(len, InternalAddress(tmp));
    // create mask to test for negative byte inside a vector
    evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
    evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);

#endif
    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 bytes is negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are zeros, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
      // Set up to look at the last 32 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 31);
      movl(len, 31);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 16-byte vectors
      testl(len, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, TAIL_START);

      andl(len, 0xfffffff0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 16);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000000f); // len is zero, any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000000f);   // tail count (in bytes)
      movdqu(vec1, Address(ary1, len, Address::times_1, -16));
      ptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up and look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 15);
      movl(len, 15);
      // Fallthru to tail compare
    }
  }

  bind(TAIL_START);
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TAIL_ADJUST);
  addptr(len, 4);
  jccb(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2-3 bytes), if any
  bind(COMPARE_CHAR);

  testl(result, 0x2);   // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, CHAR_ADJUST);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail byte
  jccb(Assembler::zero, DONE);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  testl(tmp1, 0x00000080);
  jccb(Assembler::zero, DONE);
  subptr(result, 1);
  jmpb(DONE);

  bind(TAIL_ADJUST);
  // there are negative bits in the last 4 byte block.
  // Adjust result and check the next three bytes
  addptr(result, len);
  orl(result, 3);
  lea(ary1, Address(ary1, len, Address::times_1));
  jmpb(COMPARE_CHAR);

  bind(CHAR_ADJUST);
  // We are looking at a char + optional byte tail, and found that one
  // of the bytes in the char is negative. Adjust the result, check the
  // first byte and readjust if needed.
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080);   // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);

  // That's it
  bind(DONE);
  if (UseAVX >= 2 && UseSSE >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset   = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1);      // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 32-byte vectors
    andl(result, 0x0000001f);  // tail count (in bytes)
    andl(limit, 0xffffffe0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);   // miscompare
      addptr(limit, 64);  // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL); // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64);   // it is safe, because we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);   // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    }//if (VM_Version::supports_avx512vlbw())
#endif //_LP64
    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, Address::times_1));
    vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 32);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
    vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f);  // tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  andl(limit, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, limit, Address::times_1));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  movl(chr, Address(ary1, limit, Address::times_1));
  cmpl(chr, Address(ary2, limit, Address::times_1));
  jccb(Assembler::notEqual, FALSE_LABEL);
  addptr(limit, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1);   // tail byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1);      // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, int imm8, bool merge, int vlen_enc) {
  switch(ideal_opc) {
    case Op_LShiftVS:
      Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVI:
      Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVL:
      Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVS:
      Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVI:
      Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVL:
      Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVS:
      Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVI:
      Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVL:
      Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    default:
      fatal("Unsupported masked operation"); break;
  }
}

void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
                                    bool is_varshift) {
  switch (ideal_opc) {
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVF:
      evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVD:
      evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AbsVB:
      evpabsb(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVS:
      evpabsw(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVI:
      evpabsd(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVL:
      evpabsq(dst, mask, src2, merge, vlen_enc); break;
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_VectorRearrange:
      evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
    case Op_LShiftVS:
      evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVI:
      evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVL:
      evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVS:
      evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVI:
      evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVL:
      evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVS:
      evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVI:
      evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVL:
      evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported masked operation"); break;
  }
}

void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  switch (ideal_opc) {
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported masked operation"); break;
  }
}

void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
                                  KRegister src1, KRegister src2) {
  BasicType etype = T_ILLEGAL;
  switch(mask_len) {
    case 2:
    case 4:
    case 8:  etype = T_BYTE;  break;
    case 16: etype = T_SHORT; break;
    case 32: etype = T_INT;   break;
    case 64: etype = T_LONG;  break;
    default: fatal("Unsupported type"); break;
  }
  assert(etype != T_ILLEGAL, "");
  switch(ideal_opc) {
    case Op_AndVMask:
      kand(etype, dst, src1, src2); break;
    case Op_OrVMask:
      kor(etype, dst, src1, src2); break;
    case Op_XorVMask:
      kxor(etype, dst, src1, src2); break;
    default:
      fatal("Unsupported masked operation"); break;
  }
}

/*
 * Algorithm for vector D2L and F2I conversions:
 *   a) Perform vector D2L/F2I cast.
 *   b) Take the fast path if none of the result vector lanes contains the value 0x80000000;
 *      that value signifies that the source could have been one of the special floating
 *      point values (NaN, -Inf, Inf, Max, -Min).
 *   c) Set the destination to zero if the source is a NaN value.
 *   d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
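 *
 *   Worked example (F2I, for illustration): cvttps2dq returns 0x80000000 (the
 *   "integer indefinite" value) both for NaN and for values outside the int range,
 *   e.g. (float) 2^31; step (c) then rewrites the NaN lanes to 0 and step (d)
 *   rewrites the positive out-of-range lanes to MaxInt, which matches the Java
 *   cast semantics. D2L behaves analogously with the corresponding 64-bit pattern.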
 */
void C2_MacroAssembler::vector_castD2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                                            Register scratch, int vec_enc) {
  Label done;
  evcvttpd2qq(dst, src, vec_enc);
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, scratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}

void C2_MacroAssembler::vector_castF2I_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                           AddressLiteral float_sign_flip, Register scratch, int vec_enc) {
  Label done;
  vcvttps2dq(dst, src, vec_enc);
  vmovdqu(xtmp1, float_sign_flip, scratch, vec_enc);
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  vptest(xtmp2, xtmp2, vec_enc);
  jccb(Assembler::equal, done);

  vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
  vpxor(xtmp1, xtmp1, xtmp4, vec_enc);

  vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
  vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);

  // Recompute the mask for remaining special value.
  vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
  // Extract SRC values corresponding to TRUE mask lanes.
  vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
  // values are set.
  vpxor(xtmp3, xtmp2, xtmp4, vec_enc);

  vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
  bind(done);
}

void C2_MacroAssembler::vector_castF2I_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
                                            Register scratch, int vec_enc) {
  Label done;
  vcvttps2dq(dst, src, vec_enc);
  evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, scratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}

void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
                                             BasicType from_elem_bt, BasicType to_elem_bt) {
  switch (from_elem_bt) {
    case T_BYTE:
      switch (to_elem_bt) {
        case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
        case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
        case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_SHORT:
      switch (to_elem_bt) {
        case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
        case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_INT:
      assert(to_elem_bt == T_LONG, "");
      vpmovzxdq(dst, src, vlen_enc);
      break;
    default:
      ShouldNotReachHere();
  }
}

void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
                                   bool merge, BasicType bt, int vlen_enc) {
  if (bt == T_INT) {
    evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
  } else {
    assert(bt == T_LONG, "");
    evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
  }
}

void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
                                   bool merge, BasicType bt, int vlen_enc) {
  if (bt == T_INT) {
    evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
  } else {
    assert(bt == T_LONG, "");
    evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
  }
}

#ifdef _LP64
void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  int index = 0;
  int vindex = 0;
  mov64(rtmp1, 0x0101010101010101L);
  pdep(rtmp1, src, rtmp1);
  if (mask_len > 8) {
    movq(rtmp2, src);
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  mask_len -= 8;
  while (mask_len > 0) {
    assert((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      pxor(xtmp, xtmp);
    }
    mov64(rtmp1, 0x0101010101010101L);
    shrq(rtmp2, 8);
    pdep(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write the entire 16-byte vector when both 64-bit
      // lanes are updated, to save redundant instructions.
      if (index % 2) {
        vinsertf128(dst, dst, xtmp, vindex);
      }
    } else {
      vmovdqu(dst, xtmp);
    }
    mask_len -= 8;
  }
}

void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      if (VM_Version::supports_lzcnt()) {
        lzcntq(tmp, tmp);
        movl(dst, 63);
        subl(dst, tmp);
      } else {
        movl(dst, -1);
        bsrq(tmp, tmp);
        cmov32(Assembler::notZero, dst, tmp);
      }
      break;
    case Op_VectorMaskFirstTrue:
      if (VM_Version::supports_bmi1()) {
        if (masklen < 32) {
          orl(tmp, 1 << masklen);
          tzcntl(dst, tmp);
        } else if (masklen == 32) {
          tzcntl(dst, tmp);
        } else {
          assert(masklen == 64, "");
          tzcntq(dst, tmp);
        }
      } else {
        if (masklen < 32) {
          orl(tmp, 1 << masklen);
          bsfl(dst, tmp);
        } else {
          assert(masklen == 32 || masklen == 64, "");
          movl(dst, masklen);
          if (masklen == 32) {
            bsfl(tmp, tmp);
          } else {
            bsfq(tmp, tmp);
          }
          cmov32(Assembler::notZero, dst, tmp);
        }
      }
      break;
    case Op_VectorMaskToLong:
      assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
      break;
    default: assert(false, "Unhandled mask operation");
  }
}

void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
                                              int masklen, int masksize, int vec_enc) {
  assert(VM_Version::supports_popcnt(), "");

  if (VM_Version::supports_avx512bw()) {
    kmovql(tmp, mask);
  } else {
    assert(masklen <= 16, "");
    kmovwl(tmp, mask);
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}

void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, int masklen, BasicType bt, int vec_enc) {
  assert(vec_enc == AVX_128bit && VM_Version::supports_avx() ||
         vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), "");
  assert(VM_Version::supports_popcnt(), "");

  bool need_clip = false;
  switch(bt) {
    case T_BOOLEAN:
      // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
      vpxor(xtmp, xtmp, xtmp, vec_enc);
      vpsubb(xtmp, xtmp, mask, vec_enc);
      vpmovmskb(tmp, xtmp, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_BYTE:
      vpmovmskb(tmp, mask, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_SHORT:
      vpacksswb(xtmp, mask, mask, vec_enc);
      if (masklen >= 16) {
        vpermpd(xtmp, xtmp, 8, vec_enc);
      }
      vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
      need_clip = masklen < 16;
      break;
    case T_INT:
    case T_FLOAT:
      vmovmskps(tmp, mask, vec_enc);
      need_clip = masklen < 4;
      break;
    case T_LONG:
    case T_DOUBLE:
      vmovmskpd(tmp, mask, vec_enc);
      need_clip = masklen < 2;
      break;
    default: assert(false, "Unhandled type, %s", type2name(bt));
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
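  // e.g. for masklen == 5 only bits [4:0] of tmp are meaningful, so they are masked
  // with (1 << 5) - 1 == 0x1F before the count/scan in vector_mask_operation_helper().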
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
#endif

void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}

//
// Following is a lookup-table-based popcount computation algorithm:
// Index   Bit set count
// [ 0000 -> 0,
//   0001 -> 1,
//   0010 -> 1,
//   0011 -> 2,
//   0100 -> 1,
//   0101 -> 2,
//   0110 -> 2,
//   0111 -> 3,
//   1000 -> 1,
//   1001 -> 2,
//   1010 -> 2,
//   1011 -> 3,
//   1100 -> 2,
//   1101 -> 3,
//   1110 -> 3,
//   1111 -> 4 ]
// a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// b. Right shift each byte of the vector lane by 4 positions.
// c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// d. Add the bit-set counts of the upper and lower 4 bits of each byte.
// e. Unpack double words to quad words and compute the sum of absolute differences of the
//    bit-set counts of all the bytes of a quadword.
// f. Perform step e. for the upper 128-bit vector lane.
// g. Pack the bit-set counts of quadwords back to double words.
// h. Unpacking and packing operations are not needed for 64-bit vector lanes.
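//
// A scalar sketch of the same idea, for illustration only (it is not part of the stub;
// the vector code below performs the per-byte lookups with vpshufb and the summing with
// vpsadbw):
//
//   static int popcount32_lut(uint32_t v) {
//     static const uint8_t lut[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
//     int sum = 0;
//     for (int i = 0; i < 4; i++) {
//       uint8_t b = (v >> (8 * i)) & 0xFF;
//       sum += lut[b & 0x0F] + lut[b >> 4];  // steps a-d for one byte
//     }
//     return sum;                            // steps e-g accumulate these per-byte counts
//   }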
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
                                            int vec_enc) {
  if (VM_Version::supports_avx512_vpopcntdq()) {
    vpopcntd(dst, src, vec_enc);
  } else {
    assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
    movl(rtmp, 0x0F0F0F0F);
    movdl(xtmp1, rtmp);
    vpbroadcastd(xtmp1, xtmp1, vec_enc);
    if (Assembler::AVX_512bit == vec_enc) {
      evmovdqul(xtmp2, k0, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), false, vec_enc, rtmp);
    } else {
      vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), rtmp);
    }
    vpand(xtmp3, src, xtmp1, vec_enc);
    vpshufb(xtmp3, xtmp2, xtmp3, vec_enc);
    vpsrlw(dst, src, 4, vec_enc);
    vpand(dst, dst, xtmp1, vec_enc);
    vpshufb(dst, xtmp2, dst, vec_enc);
    vpaddb(xtmp3, dst, xtmp3, vec_enc);
    vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
    vpunpckhdq(dst, xtmp3, xtmp1, vec_enc);
    vpsadbw(dst, dst, xtmp1, vec_enc);
    vpunpckldq(xtmp2, xtmp3, xtmp1, vec_enc);
    vpsadbw(xtmp2, xtmp2, xtmp1, vec_enc);
    vpackuswb(dst, xtmp2, dst, vec_enc);
  }
}

void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
                                             int vec_enc) {
  if (VM_Version::supports_avx512_vpopcntdq()) {
    vpopcntq(dst, src, vec_enc);
  } else if (vec_enc == Assembler::AVX_512bit) {
    assert(VM_Version::supports_avx512bw(), "");
    movl(rtmp, 0x0F0F0F0F);
    movdl(xtmp1, rtmp);
    vpbroadcastd(xtmp1, xtmp1, vec_enc);
    evmovdqul(xtmp2, k0, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), true, vec_enc, rtmp);
    vpandq(xtmp3, src, xtmp1, vec_enc);
    vpshufb(xtmp3, xtmp2, xtmp3, vec_enc);
    vpsrlw(dst, src, 4, vec_enc);
    vpandq(dst, dst, xtmp1, vec_enc);
    vpshufb(dst, xtmp2, dst, vec_enc);
    vpaddb(xtmp3, dst, xtmp3, vec_enc);
    vpxorq(xtmp1, xtmp1, xtmp1, vec_enc);
    vpsadbw(dst, xtmp3, xtmp1, vec_enc);
  } else {
    // We do not see any performance benefit from running the above instruction
    // sequence on a 256-bit vector, which can hold at most 4 long elements.
    ShouldNotReachHere();
  }
  evpmovqd(dst, dst, vec_enc);
}

#ifndef _LP64
void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
  assert(VM_Version::supports_avx512bw(), "");
  kmovdl(tmp, src);
  kunpckdql(dst, tmp, tmp);
}
#endif