/*
 * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input:  abort_status
//         rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

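  // A plain-C++ sketch (illustrative only) of the decision the assembly below
  // implements; the flag names are the product flags used in this file, and the
  // MDO update is really done by OR-ing into MethodData::rtm_state_offset_in_bytes():
  //
  //   if (abort_count >= RTMAbortThreshold &&
  //       abort_count * 100 >= total_count * RTMTotalCountIncrRate * RTMAbortRatio) {
  //     rtm_state = NoRTM;        // too many aborts: stop using RTM for this site
  //   } else if (total_count >= RTMLockingThreshold / RTMTotalCountIncrRate) {
  //     rtm_state = UseRTM;       // enough clean runs: always use RTM
  //   }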
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input:  abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
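//
// In plain C++ terms the retry condition checked below is (a sketch, using
// hypothetical scalar names for the two register inputs):
//
//   bool retry = ((abort_status & 0x6) != 0) && (retry_count != 0);
//   if (retry) { pause(); retry_count--; goto retryLabel; }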
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}
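
// The helper above, sketched in plain C++ (illustrative only; 'owner_of(monitor)'
// is a hypothetical accessor for the owner field read through owner_offset):
//
//   if (retry_count == 0) return;              // retries exhausted; ZF cleared above
//   retry_count--;
//   int spins = RTMSpinLoopCount;
//   do {
//     pause();
//   } while (--spins > 0 && owner_of(monitor) != NULL);
//   goto retryLabel;                           // spin budget spent or lock looks free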

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // fetch markword
  andptr(tmpReg, markWord::lock_mask_in_place);                      // look at 2 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                          // bits = 01 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                 // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2);  // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset)) ;
  testptr(tmpReg, tmpReg) ;
  jccb(Assembler::notZero, L_decrement_retry) ;

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL) ;
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}
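
// The transactional fast path emitted above, sketched in plain C++ (illustrative
// only; 'monitor_owner' stands for the ObjectMonitor owner field read via
// owner_offset):
//
//   xbegin();                                   // enter transaction (aborts land on L_on_abort)
//   if (monitor_owner == NULL) {
//     // lock elided: stay inside the transaction and fall through to DONE_LABEL
//   } else if (UseRTMXendForLockBusy) {
//     xend();                                   // commit empty transaction, then spin/retry
//   } else {
//     xabort(0);                                // abort, then profile and maybe retry
//   }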

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//   fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//   fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
//   In the case of failure, the node will branch directly to the
//   FailureLabel


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, cx1Reg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jccb(Assembler::notZero, IsInflated);

  if (!UseHeavyMonitors) {
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    jcc(Assembler::equal, DONE_LABEL);           // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
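    // In plain C++ terms (a sketch; 'mark' is the value the failed CAS left in
    // tmpReg, 'sp' is the current stack pointer):
    //
    //   bool recursive = (uintptr_t)(mark - sp) < (uintptr_t)os::vm_page_size();
    //
    // The mask used below folds the page-proximity check and the low lock bits
    // into a single AND, leaving ZF == 1 exactly in the recursive case.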
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  } else {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                  // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);           // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                     // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                  // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jcc(Assembler::equal, DONE_LABEL);       // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(r15_thread, rax);                 // Check if we are already the owner (recursive lock)
  jcc(Assembler::notEqual, DONE_LABEL);    // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, CheckSucc;

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  if (!UseHeavyMonitors) {
    cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
    jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (!UseHeavyMonitors) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jccb  (Assembler::zero, Stacked);
  }

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmpb(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
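  // The 1-0 exit, sketched in plain C++ (illustrative only; 'm' is the
  // ObjectMonitor, field names as in objectMonitor.hpp; this mirrors the LP64
  // path below -- the 32-bit path sends recursive exits to the slow path):
  //
  //   if (m->_recursions != 0) { m->_recursions--; return true; }   // recursive unlock
  //   if (m->_cxq == NULL && m->_EntryList == NULL) {
  //     m->_owner = NULL;                                           // 1-0 exit: no CAS needed
  //     return true;
  //   }
  //   // contended: fall into the CheckSucc / re-acquire logic below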
#ifndef _LP64
  get_thread (boxReg);

  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, CheckSucc);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  bind (Stacked);
  // It's not inflated and it's not recursively stack-locked.
  // It must be stack-locked.
  // Try to reset the header to displaced header.
  // The "box" value on the stack is stable, so we can reload
  // and be assured we observe the same value as above.
  movptr(tmpReg, Address(boxReg, 0));
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-through into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind (CheckSucc);
#else // _LP64
  // It's inflated
  Label LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

  if (!UseHeavyMonitors) {
    bind  (Stacked);
    movptr(tmpReg, Address (boxReg, 0));  // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  }
#endif
  bind(DONE_LABEL);
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

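// The blend/compare sequences below implement Java's Math.min/max semantics
// rather than a plain compare: NaN inputs produce NaN, and -0.0 is treated as
// smaller than +0.0. A scalar sketch of the required behavior (illustrative only):
//
//   float java_min(float a, float b) {
//     if (a != a) return a;                // NaN propagates
//     if (b != b) return b;
//     if (a == 0.0f && b == 0.0f) {
//       return signbit(a) ? a : b;         // prefer -0.0 over +0.0
//     }
//     return (a < b) ? a : b;
//   }
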
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
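// A scalar sketch (illustrative only) of the behavior emitted below, which
// matches Math.signum: zeroes and NaN are returned unchanged, everything else
// collapses to +1.0 or -1.0 with the sign taken from the argument:
//
//   double signum(double d) {
//     if (d == 0.0 || d != d) return d;    // +0.0, -0.0 and NaN pass through
//     return (d > 0.0) ? 1.0 : -1.0;
//   }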
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,
                                  XMMRegister zero, XMMRegister one,
                                  Register scratch) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
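        // AVX2 has no variable arithmetic right shift for 64-bit lanes, so the
        // sequence below synthesizes it from logical shifts using the identity
        // (a sketch; 'm' is the per-lane sign mask 0x8000000000000000 loaded
        // from vector_long_sign_mask()):
        //
        //   sra(x, s) == ((x >>> s) ^ (m >>> s)) - (m >>> s)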
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}

void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst, val, idx);
      break;
    case T_SHORT:
      pinsrw(dst, val, idx);
      break;
    case T_INT:
      pinsrd(dst, val, idx);
      break;
    case T_LONG:
      pinsrq(dst, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      vpinsrb(dst, src, val, idx);
      break;
    case T_SHORT:
      vpinsrw(dst, src, val, idx);
      break;
    case T_INT:
      vpinsrd(dst, src, val, idx);
      break;
    case T_LONG:
      vpinsrq(dst, src, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
  switch(typ) {
    case T_INT:
      vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_FLOAT:
      vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_LONG:
      vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    case T_DOUBLE:
      vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
  switch(typ) {
    case T_INT:
      evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_FLOAT:
      evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_LONG:
      evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    case T_DOUBLE:
      evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
  switch(typ) {
    case T_INT:
      evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_FLOAT:
      evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_LONG:
      evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    case T_DOUBLE:
      evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
  if (vlen_in_bytes <= 16) {
    pxor (dst, dst);
    psubb(dst, src);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
AVX_256bit : vlen_enc); 1455 1456 switch (elem_bt) { 1457 case T_BYTE: /* nothing to do */ break; 1458 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1459 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1460 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1461 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1462 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1463 1464 default: assert(false, "%s", type2name(elem_bt)); 1465 } 1466 } 1467 } 1468 1469 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, 1470 Register tmp, bool novlbwdq, int vlen_enc) { 1471 if (novlbwdq) { 1472 vpmovsxbd(xtmp, src, vlen_enc); 1473 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1474 Assembler::eq, true, vlen_enc, tmp); 1475 } else { 1476 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1477 vpsubb(xtmp, xtmp, src, vlen_enc); 1478 evpmovb2m(dst, xtmp, vlen_enc); 1479 } 1480 } 1481 1482 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1483 switch (vlen_in_bytes) { 1484 case 4: movdl(dst, src); break; 1485 case 8: movq(dst, src); break; 1486 case 16: movdqu(dst, src); break; 1487 case 32: vmovdqu(dst, src); break; 1488 case 64: evmovdquq(dst, src, Assembler::AVX_512bit); break; 1489 default: ShouldNotReachHere(); 1490 } 1491 } 1492 1493 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1494 if (reachable(src)) { 1495 load_vector(dst, as_Address(src), vlen_in_bytes); 1496 } else { 1497 lea(rscratch, src); 1498 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1499 } 1500 } 1501 1502 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) { 1503 ExternalAddress addr(StubRoutines::x86::vector_iota_indices()); 1504 if (vlen_in_bytes == 4) { 1505 movdl(dst, addr); 1506 } else if (vlen_in_bytes == 8) { 1507 movq(dst, addr); 1508 } else if (vlen_in_bytes == 16) { 1509 movdqu(dst, addr, scratch); 1510 } else if (vlen_in_bytes == 32) { 1511 vmovdqu(dst, addr, scratch); 1512 } else { 1513 assert(vlen_in_bytes == 64, "%d", vlen_in_bytes); 1514 evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch); 1515 } 1516 } 1517 1518 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
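// The reduction helpers below share one strategy: repeatedly fold the upper
// half of the vector onto its lower half with the reduction operation until a
// single lane remains. The integer variants then combine that lane with the
// incoming scalar in src1; the FP variants accumulate directly into dst.
// Rough scalar sketch of what, e.g., an 8-lane integer add reduction computes
// (illustration only, with made-up names; this is not generated code):
//
//   int acc = src1_scalar;            // incoming accumulator value
//   for (int i = 0; i < 8; i++) {
//     acc += src2_lane[i];            // performed below in log2(8) folding steps
//   }
//   // acc is the value left in dst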
1519 1520 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1521 int vector_len = Assembler::AVX_128bit; 1522 1523 switch (opcode) { 1524 case Op_AndReductionV: pand(dst, src); break; 1525 case Op_OrReductionV: por (dst, src); break; 1526 case Op_XorReductionV: pxor(dst, src); break; 1527 case Op_MinReductionV: 1528 switch (typ) { 1529 case T_BYTE: pminsb(dst, src); break; 1530 case T_SHORT: pminsw(dst, src); break; 1531 case T_INT: pminsd(dst, src); break; 1532 case T_LONG: assert(UseAVX > 2, "required"); 1533 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1534 default: assert(false, "wrong type"); 1535 } 1536 break; 1537 case Op_MaxReductionV: 1538 switch (typ) { 1539 case T_BYTE: pmaxsb(dst, src); break; 1540 case T_SHORT: pmaxsw(dst, src); break; 1541 case T_INT: pmaxsd(dst, src); break; 1542 case T_LONG: assert(UseAVX > 2, "required"); 1543 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1544 default: assert(false, "wrong type"); 1545 } 1546 break; 1547 case Op_AddReductionVF: addss(dst, src); break; 1548 case Op_AddReductionVD: addsd(dst, src); break; 1549 case Op_AddReductionVI: 1550 switch (typ) { 1551 case T_BYTE: paddb(dst, src); break; 1552 case T_SHORT: paddw(dst, src); break; 1553 case T_INT: paddd(dst, src); break; 1554 default: assert(false, "wrong type"); 1555 } 1556 break; 1557 case Op_AddReductionVL: paddq(dst, src); break; 1558 case Op_MulReductionVF: mulss(dst, src); break; 1559 case Op_MulReductionVD: mulsd(dst, src); break; 1560 case Op_MulReductionVI: 1561 switch (typ) { 1562 case T_SHORT: pmullw(dst, src); break; 1563 case T_INT: pmulld(dst, src); break; 1564 default: assert(false, "wrong type"); 1565 } 1566 break; 1567 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1568 vpmullq(dst, dst, src, vector_len); break; 1569 default: assert(false, "wrong opcode"); 1570 } 1571 } 1572 1573 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1574 int vector_len = Assembler::AVX_256bit; 1575 1576 switch (opcode) { 1577 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1578 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1579 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1580 case Op_MinReductionV: 1581 switch (typ) { 1582 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1583 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1584 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1585 case T_LONG: assert(UseAVX > 2, "required"); 1586 vpminsq(dst, src1, src2, vector_len); break; 1587 default: assert(false, "wrong type"); 1588 } 1589 break; 1590 case Op_MaxReductionV: 1591 switch (typ) { 1592 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1593 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1594 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1595 case T_LONG: assert(UseAVX > 2, "required"); 1596 vpmaxsq(dst, src1, src2, vector_len); break; 1597 default: assert(false, "wrong type"); 1598 } 1599 break; 1600 case Op_AddReductionVI: 1601 switch (typ) { 1602 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1603 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1604 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1605 default: assert(false, "wrong type"); 1606 } 1607 break; 1608 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1609 case Op_MulReductionVI: 1610 switch (typ) { 1611 case 
T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1612 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1613 default: assert(false, "wrong type"); 1614 } 1615 break; 1616 case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break; 1617 default: assert(false, "wrong opcode"); 1618 } 1619 } 1620 1621 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1622 XMMRegister dst, XMMRegister src, 1623 XMMRegister vtmp1, XMMRegister vtmp2) { 1624 switch (opcode) { 1625 case Op_AddReductionVF: 1626 case Op_MulReductionVF: 1627 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1628 break; 1629 1630 case Op_AddReductionVD: 1631 case Op_MulReductionVD: 1632 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1633 break; 1634 1635 default: assert(false, "wrong opcode"); 1636 } 1637 } 1638 1639 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1640 Register dst, Register src1, XMMRegister src2, 1641 XMMRegister vtmp1, XMMRegister vtmp2) { 1642 switch (vlen) { 1643 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1644 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1645 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1646 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1647 1648 default: assert(false, "wrong vector length"); 1649 } 1650 } 1651 1652 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1653 Register dst, Register src1, XMMRegister src2, 1654 XMMRegister vtmp1, XMMRegister vtmp2) { 1655 switch (vlen) { 1656 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1657 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1658 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1659 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1660 1661 default: assert(false, "wrong vector length"); 1662 } 1663 } 1664 1665 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1666 Register dst, Register src1, XMMRegister src2, 1667 XMMRegister vtmp1, XMMRegister vtmp2) { 1668 switch (vlen) { 1669 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1670 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1671 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1672 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1673 1674 default: assert(false, "wrong vector length"); 1675 } 1676 } 1677 1678 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1679 Register dst, Register src1, XMMRegister src2, 1680 XMMRegister vtmp1, XMMRegister vtmp2) { 1681 switch (vlen) { 1682 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1683 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1684 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1685 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1686 1687 default: assert(false, "wrong vector length"); 1688 } 1689 } 1690 1691 #ifdef _LP64 1692 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1693 Register dst, Register src1, XMMRegister src2, 1694 XMMRegister vtmp1, XMMRegister vtmp2) { 1695 switch (vlen) { 1696 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1697 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1698 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1699 1700 default: assert(false, "wrong vector length"); 1701 } 1702 } 1703 #endif // _LP64 1704 1705 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, 
XMMRegister vtmp1, XMMRegister vtmp2) { 1706 switch (vlen) { 1707 case 2: 1708 assert(vtmp2 == xnoreg, ""); 1709 reduce2F(opcode, dst, src, vtmp1); 1710 break; 1711 case 4: 1712 assert(vtmp2 == xnoreg, ""); 1713 reduce4F(opcode, dst, src, vtmp1); 1714 break; 1715 case 8: 1716 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1717 break; 1718 case 16: 1719 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1720 break; 1721 default: assert(false, "wrong vector length"); 1722 } 1723 } 1724 1725 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1726 switch (vlen) { 1727 case 2: 1728 assert(vtmp2 == xnoreg, ""); 1729 reduce2D(opcode, dst, src, vtmp1); 1730 break; 1731 case 4: 1732 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1733 break; 1734 case 8: 1735 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1736 break; 1737 default: assert(false, "wrong vector length"); 1738 } 1739 } 1740 1741 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1742 if (opcode == Op_AddReductionVI) { 1743 if (vtmp1 != src2) { 1744 movdqu(vtmp1, src2); 1745 } 1746 phaddd(vtmp1, vtmp1); 1747 } else { 1748 pshufd(vtmp1, src2, 0x1); 1749 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1750 } 1751 movdl(vtmp2, src1); 1752 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1753 movdl(dst, vtmp1); 1754 } 1755 1756 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1757 if (opcode == Op_AddReductionVI) { 1758 if (vtmp1 != src2) { 1759 movdqu(vtmp1, src2); 1760 } 1761 phaddd(vtmp1, src2); 1762 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1763 } else { 1764 pshufd(vtmp2, src2, 0xE); 1765 reduce_operation_128(T_INT, opcode, vtmp2, src2); 1766 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1767 } 1768 } 1769 1770 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1771 if (opcode == Op_AddReductionVI) { 1772 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 1773 vextracti128_high(vtmp2, vtmp1); 1774 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 1775 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1776 } else { 1777 vextracti128_high(vtmp1, src2); 1778 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1779 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1780 } 1781 } 1782 1783 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1784 vextracti64x4_high(vtmp2, src2); 1785 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 1786 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1787 } 1788 1789 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1790 pshufd(vtmp2, src2, 0x1); 1791 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1792 movdqu(vtmp1, vtmp2); 1793 psrldq(vtmp1, 2); 1794 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1795 movdqu(vtmp2, vtmp1); 1796 psrldq(vtmp2, 1); 1797 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1798 movdl(vtmp2, src1); 1799 pmovsxbd(vtmp1, vtmp1); 1800 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1801 pextrb(dst, vtmp1, 0x0); 1802 movsbl(dst, dst); 1803 } 1804 1805 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1806 
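// Fold the upper 8 bytes of src2 onto the lower 8 bytes, then finish with the 8-byte reduction.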
pshufd(vtmp1, src2, 0xE); 1807 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 1808 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1809 } 1810 1811 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1812 vextracti128_high(vtmp2, src2); 1813 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1814 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1815 } 1816 1817 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1818 vextracti64x4_high(vtmp1, src2); 1819 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 1820 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1821 } 1822 1823 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1824 pmovsxbw(vtmp2, src2); 1825 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1826 } 1827 1828 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1829 if (UseAVX > 1) { 1830 int vector_len = Assembler::AVX_256bit; 1831 vpmovsxbw(vtmp1, src2, vector_len); 1832 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1833 } else { 1834 pmovsxbw(vtmp2, src2); 1835 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1836 pshufd(vtmp2, src2, 0x1); 1837 pmovsxbw(vtmp2, src2); 1838 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1839 } 1840 } 1841 1842 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1843 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 1844 int vector_len = Assembler::AVX_512bit; 1845 vpmovsxbw(vtmp1, src2, vector_len); 1846 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1847 } else { 1848 assert(UseAVX >= 2,"Should not reach here."); 1849 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 1850 vextracti128_high(vtmp2, src2); 1851 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1852 } 1853 } 1854 1855 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1856 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 1857 vextracti64x4_high(vtmp2, src2); 1858 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1859 } 1860 1861 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1862 if (opcode == Op_AddReductionVI) { 1863 if (vtmp1 != src2) { 1864 movdqu(vtmp1, src2); 1865 } 1866 phaddw(vtmp1, vtmp1); 1867 phaddw(vtmp1, vtmp1); 1868 } else { 1869 pshufd(vtmp2, src2, 0x1); 1870 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 1871 movdqu(vtmp1, vtmp2); 1872 psrldq(vtmp1, 2); 1873 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 1874 } 1875 movdl(vtmp2, src1); 1876 pmovsxwd(vtmp1, vtmp1); 1877 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1878 pextrw(dst, vtmp1, 0x0); 1879 movswl(dst, dst); 1880 } 1881 1882 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1883 if (opcode == Op_AddReductionVI) { 1884 if (vtmp1 != src2) { 1885 movdqu(vtmp1, src2); 1886 } 1887 phaddw(vtmp1, src2); 1888 } else { 1889 pshufd(vtmp1, src2, 0xE); 1890 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 1891 } 1892 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1893 } 1894 1895 void 
C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1896 if (opcode == Op_AddReductionVI) { 1897 int vector_len = Assembler::AVX_256bit; 1898 vphaddw(vtmp2, src2, src2, vector_len); 1899 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 1900 } else { 1901 vextracti128_high(vtmp2, src2); 1902 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 1903 } 1904 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1905 } 1906 1907 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1908 int vector_len = Assembler::AVX_256bit; 1909 vextracti64x4_high(vtmp1, src2); 1910 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 1911 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1912 } 1913 1914 #ifdef _LP64 1915 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1916 pshufd(vtmp2, src2, 0xE); 1917 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 1918 movdq(vtmp1, src1); 1919 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 1920 movdq(dst, vtmp1); 1921 } 1922 1923 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1924 vextracti128_high(vtmp1, src2); 1925 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 1926 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1927 } 1928 1929 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1930 vextracti64x4_high(vtmp2, src2); 1931 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 1932 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1933 } 1934 1935 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 1936 mov64(temp, -1L); 1937 bzhiq(temp, temp, len); 1938 kmovql(dst, temp); 1939 } 1940 #endif // _LP64 1941 1942 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 1943 reduce_operation_128(T_FLOAT, opcode, dst, src); 1944 pshufd(vtmp, src, 0x1); 1945 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1946 } 1947 1948 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 1949 reduce2F(opcode, dst, src, vtmp); 1950 pshufd(vtmp, src, 0x2); 1951 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1952 pshufd(vtmp, src, 0x3); 1953 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1954 } 1955 1956 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1957 reduce4F(opcode, dst, src, vtmp2); 1958 vextractf128_high(vtmp2, src); 1959 reduce4F(opcode, dst, vtmp2, vtmp1); 1960 } 1961 1962 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1963 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1964 vextracti64x4_high(vtmp1, src); 1965 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 1966 } 1967 1968 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 1969 reduce_operation_128(T_DOUBLE, opcode, dst, src); 1970 pshufd(vtmp, src, 0xE); 1971 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 1972 } 1973 1974 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1975 reduce2D(opcode, dst, src, vtmp2); 1976 vextractf128_high(vtmp2, src); 1977 
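// Fold the two doubles from the upper 128-bit lane into the result already accumulated in dst.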
reduce2D(opcode, dst, vtmp2, vtmp1); 1978 } 1979 1980 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1981 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1982 vextracti64x4_high(vtmp1, src); 1983 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 1984 } 1985 1986 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) { 1987 MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len); 1988 } 1989 1990 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) { 1991 MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len); 1992 } 1993 1994 1995 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 1996 XMMRegister dst, XMMRegister src, 1997 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 1998 XMMRegister xmm_0, XMMRegister xmm_1) { 1999 int permconst[] = {1, 14}; 2000 XMMRegister wsrc = src; 2001 XMMRegister wdst = xmm_0; 2002 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2003 2004 int vlen_enc = Assembler::AVX_128bit; 2005 if (vlen == 16) { 2006 vlen_enc = Assembler::AVX_256bit; 2007 } 2008 2009 for (int i = log2(vlen) - 1; i >=0; i--) { 2010 if (i == 0 && !is_dst_valid) { 2011 wdst = dst; 2012 } 2013 if (i == 3) { 2014 vextracti64x4_high(wtmp, wsrc); 2015 } else if (i == 2) { 2016 vextracti128_high(wtmp, wsrc); 2017 } else { // i = [0,1] 2018 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2019 } 2020 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2021 wsrc = wdst; 2022 vlen_enc = Assembler::AVX_128bit; 2023 } 2024 if (is_dst_valid) { 2025 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2026 } 2027 } 2028 2029 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2030 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2031 XMMRegister xmm_0, XMMRegister xmm_1) { 2032 XMMRegister wsrc = src; 2033 XMMRegister wdst = xmm_0; 2034 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2035 int vlen_enc = Assembler::AVX_128bit; 2036 if (vlen == 8) { 2037 vlen_enc = Assembler::AVX_256bit; 2038 } 2039 for (int i = log2(vlen) - 1; i >=0; i--) { 2040 if (i == 0 && !is_dst_valid) { 2041 wdst = dst; 2042 } 2043 if (i == 1) { 2044 vextracti128_high(wtmp, wsrc); 2045 } else if (i == 2) { 2046 vextracti64x4_high(wtmp, wsrc); 2047 } else { 2048 assert(i == 0, "%d", i); 2049 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2050 } 2051 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2052 wsrc = wdst; 2053 vlen_enc = Assembler::AVX_128bit; 2054 } 2055 if (is_dst_valid) { 2056 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2057 } 2058 } 2059 2060 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2061 switch (bt) { 2062 case T_BYTE: pextrb(dst, src, idx); break; 2063 case T_SHORT: pextrw(dst, src, idx); break; 2064 case T_INT: pextrd(dst, src, idx); break; 2065 case T_LONG: pextrq(dst, src, idx); break; 2066 2067 default: 2068 assert(false,"Should not reach here."); 2069 break; 2070 } 2071 } 2072 2073 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2074 int esize = type2aelembytes(typ); 2075 int elem_per_lane = 16/esize; 2076 int lane = elemindex / elem_per_lane; 2077 int eindex = elemindex % elem_per_lane; 2078 2079 if (lane >= 2) { 2080 assert(UseAVX > 2, "required"); 2081 vextractf32x4(dst, src, lane & 3); 2082 return dst; 2083 } else if (lane > 0) { 2084 assert(UseAVX > 0, "required"); 2085 vextractf128(dst, src, lane); 2086 return dst; 2087 } else { 2088 return src; 2089 } 2090 } 2091 2092 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2093 int esize = type2aelembytes(typ); 2094 int elem_per_lane = 16/esize; 2095 int eindex = elemindex % elem_per_lane; 2096 assert(is_integral_type(typ),"required"); 2097 2098 if (eindex == 0) { 2099 if (typ == T_LONG) { 2100 movq(dst, src); 2101 } else { 2102 movdl(dst, src); 2103 if (typ == T_BYTE) 2104 movsbl(dst, dst); 2105 else if (typ == T_SHORT) 2106 movswl(dst, dst); 2107 } 2108 } else { 2109 extract(typ, dst, src, eindex); 2110 } 2111 } 2112 2113 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) { 2114 int esize = type2aelembytes(typ); 2115 int elem_per_lane = 16/esize; 2116 int eindex = elemindex % elem_per_lane; 2117 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2118 2119 if (eindex == 0) { 2120 movq(dst, src); 2121 } else { 2122 if (typ == T_FLOAT) { 2123 if (UseAVX == 0) { 2124 movdqu(dst, src); 2125 pshufps(dst, dst, eindex); 2126 } else { 2127 vpshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2128 } 2129 } else { 2130 if (UseAVX == 0) { 2131 movdqu(dst, src); 2132 psrldq(dst, eindex*esize); 2133 } else { 2134 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2135 } 2136 movq(dst, dst); 2137 } 2138 } 2139 // Zero upper bits 2140 if (typ == T_FLOAT) { 2141 if (UseAVX == 0) { 2142 assert((vtmp != xnoreg) && (tmp != noreg), "required."); 2143 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp); 2144 pand(dst, vtmp); 2145 } else { 2146 assert((tmp != noreg), "required."); 2147 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp); 2148 } 2149 } 2150 } 2151 2152 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, 
XMMRegister src2, int comparison, int vector_len) { 2153 switch(typ) { 2154 case T_BYTE: 2155 case T_BOOLEAN: 2156 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2157 break; 2158 case T_SHORT: 2159 case T_CHAR: 2160 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2161 break; 2162 case T_INT: 2163 case T_FLOAT: 2164 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2165 break; 2166 case T_LONG: 2167 case T_DOUBLE: 2168 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2169 break; 2170 default: 2171 assert(false,"Should not reach here."); 2172 break; 2173 } 2174 } 2175 2176 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) { 2177 switch(typ) { 2178 case T_BOOLEAN: 2179 case T_BYTE: 2180 evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2181 break; 2182 case T_CHAR: 2183 case T_SHORT: 2184 evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2185 break; 2186 case T_INT: 2187 case T_FLOAT: 2188 evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2189 break; 2190 case T_LONG: 2191 case T_DOUBLE: 2192 evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2193 break; 2194 default: 2195 assert(false,"Should not reach here."); 2196 break; 2197 } 2198 } 2199 2200 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2201 switch(typ) { 2202 case T_BYTE: 2203 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2204 break; 2205 case T_SHORT: 2206 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2207 break; 2208 case T_INT: 2209 case T_FLOAT: 2210 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2211 break; 2212 case T_LONG: 2213 case T_DOUBLE: 2214 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2215 break; 2216 default: 2217 assert(false,"Should not reach here."); 2218 break; 2219 } 2220 } 2221 2222 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2, 2223 XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) { 2224 switch(vlen) { 2225 case 4: 2226 assert(vtmp1 != xnoreg, "required."); 2227 // Broadcast lower 32 bits to 128 bits before ptest 2228 pshufd(vtmp1, src1, 0x0); 2229 if (bt == BoolTest::overflow) { 2230 assert(vtmp2 != xnoreg, "required."); 2231 pshufd(vtmp2, src2, 0x0); 2232 } else { 2233 assert(vtmp2 == xnoreg, "required."); 2234 vtmp2 = src2; 2235 } 2236 ptest(vtmp1, vtmp2); 2237 break; 2238 case 8: 2239 assert(vtmp1 != xnoreg, "required."); 2240 // Broadcast lower 64 bits to 128 bits before ptest 2241 pshufd(vtmp1, src1, 0x4); 2242 if (bt == BoolTest::overflow) { 2243 assert(vtmp2 != xnoreg, "required."); 2244 pshufd(vtmp2, src2, 0x4); 2245 } else { 2246 assert(vtmp2 == xnoreg, "required."); 2247 vtmp2 = src2; 2248 } 2249 ptest(vtmp1, vtmp2); 2250 break; 2251 case 16: 2252 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2253 ptest(src1, src2); 2254 break; 2255 case 32: 2256 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2257 vptest(src1, src2, Assembler::AVX_256bit); 2258 break; 2259 case 64: 2260 { 2261 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2262 evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit); 2263 if (bt == 
BoolTest::ne) { 2264 ktestql(mask, mask); 2265 } else { 2266 assert(bt == BoolTest::overflow, "required"); 2267 kortestql(mask, mask); 2268 } 2269 } 2270 break; 2271 default: 2272 assert(false,"Should not reach here."); 2273 break; 2274 } 2275 } 2276 2277 //------------------------------------------------------------------------------------------- 2278 2279 // IndexOf for constant substrings with size >= 8 chars 2280 // which don't need to be loaded through stack. 2281 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2282 Register cnt1, Register cnt2, 2283 int int_cnt2, Register result, 2284 XMMRegister vec, Register tmp, 2285 int ae) { 2286 ShortBranchVerifier sbv(this); 2287 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2288 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2289 2290 // This method uses the pcmpestri instruction with bound registers 2291 // inputs: 2292 // xmm - substring 2293 // rax - substring length (elements count) 2294 // mem - scanned string 2295 // rdx - string length (elements count) 2296 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2297 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2298 // outputs: 2299 // rcx - matched index in string 2300 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2301 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2302 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2303 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2304 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2305 2306 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2307 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2308 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2309 2310 // Note, inline_string_indexOf() generates checks: 2311 // if (substr.count > string.count) return -1; 2312 // if (substr.count == 0) return 0; 2313 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2314 2315 // Load substring. 2316 if (ae == StrIntrinsicNode::UL) { 2317 pmovzxbw(vec, Address(str2, 0)); 2318 } else { 2319 movdqu(vec, Address(str2, 0)); 2320 } 2321 movl(cnt2, int_cnt2); 2322 movptr(result, str1); // string addr 2323 2324 if (int_cnt2 > stride) { 2325 jmpb(SCAN_TO_SUBSTR); 2326 2327 // Reload substr for rescan, this code 2328 // is executed only for large substrings (> 8 chars) 2329 bind(RELOAD_SUBSTR); 2330 if (ae == StrIntrinsicNode::UL) { 2331 pmovzxbw(vec, Address(str2, 0)); 2332 } else { 2333 movdqu(vec, Address(str2, 0)); 2334 } 2335 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2336 2337 bind(RELOAD_STR); 2338 // We came here after the beginning of the substring was 2339 // matched but the rest of it was not so we need to search 2340 // again. Start from the next element after the previous match. 2341 2342 // cnt2 is number of substring reminding elements and 2343 // cnt1 is number of string reminding elements when cmp failed. 
2344 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2345 subl(cnt1, cnt2); 2346 addl(cnt1, int_cnt2); 2347 movl(cnt2, int_cnt2); // Now restore cnt2 2348 2349 decrementl(cnt1); // Shift to next element 2350 cmpl(cnt1, cnt2); 2351 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2352 2353 addptr(result, (1<<scale1)); 2354 2355 } // (int_cnt2 > 8) 2356 2357 // Scan string for start of substr in 16-byte vectors 2358 bind(SCAN_TO_SUBSTR); 2359 pcmpestri(vec, Address(result, 0), mode); 2360 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2361 subl(cnt1, stride); 2362 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2363 cmpl(cnt1, cnt2); 2364 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2365 addptr(result, 16); 2366 jmpb(SCAN_TO_SUBSTR); 2367 2368 // Found a potential substr 2369 bind(FOUND_CANDIDATE); 2370 // Matched whole vector if first element matched (tmp(rcx) == 0). 2371 if (int_cnt2 == stride) { 2372 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2373 } else { // int_cnt2 > 8 2374 jccb(Assembler::overflow, FOUND_SUBSTR); 2375 } 2376 // After pcmpestri tmp(rcx) contains matched element index 2377 // Compute start addr of substr 2378 lea(result, Address(result, tmp, scale1)); 2379 2380 // Make sure string is still long enough 2381 subl(cnt1, tmp); 2382 cmpl(cnt1, cnt2); 2383 if (int_cnt2 == stride) { 2384 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2385 } else { // int_cnt2 > 8 2386 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2387 } 2388 // Left less than substring. 2389 2390 bind(RET_NOT_FOUND); 2391 movl(result, -1); 2392 jmp(EXIT); 2393 2394 if (int_cnt2 > stride) { 2395 // This code is optimized for the case when whole substring 2396 // is matched if its head is matched. 2397 bind(MATCH_SUBSTR_HEAD); 2398 pcmpestri(vec, Address(result, 0), mode); 2399 // Reload only the string if it does not match 2400 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2401 2402 Label CONT_SCAN_SUBSTR; 2403 // Compare the rest of substring (> 8 chars). 2404 bind(FOUND_SUBSTR); 2405 // First 8 chars are already matched.
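// From here cnt2 is used as a negative element offset from the end of the substring;
// it is advanced by 'stride' on each pass through SCAN_SUBSTR until it reaches zero.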
2406 negptr(cnt2); 2407 addptr(cnt2, stride); 2408 2409 bind(SCAN_SUBSTR); 2410 subl(cnt1, stride); 2411 cmpl(cnt2, -stride); // Do not read beyond substring 2412 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2413 // Back-up strings to avoid reading beyond substring: 2414 // cnt1 = cnt1 - cnt2 + 8 2415 addl(cnt1, cnt2); // cnt2 is negative 2416 addl(cnt1, stride); 2417 movl(cnt2, stride); negptr(cnt2); 2418 bind(CONT_SCAN_SUBSTR); 2419 if (int_cnt2 < (int)G) { 2420 int tail_off1 = int_cnt2<<scale1; 2421 int tail_off2 = int_cnt2<<scale2; 2422 if (ae == StrIntrinsicNode::UL) { 2423 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2424 } else { 2425 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2426 } 2427 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2428 } else { 2429 // calculate index in register to avoid integer overflow (int_cnt2*2) 2430 movl(tmp, int_cnt2); 2431 addptr(tmp, cnt2); 2432 if (ae == StrIntrinsicNode::UL) { 2433 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2434 } else { 2435 movdqu(vec, Address(str2, tmp, scale2, 0)); 2436 } 2437 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2438 } 2439 // Need to reload strings pointers if not matched whole vector 2440 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2441 addptr(cnt2, stride); 2442 jcc(Assembler::negative, SCAN_SUBSTR); 2443 // Fall through if found full substring 2444 2445 } // (int_cnt2 > 8) 2446 2447 bind(RET_FOUND); 2448 // Found result if we matched full small substring. 2449 // Compute substr offset 2450 subptr(result, str1); 2451 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2452 shrl(result, 1); // index 2453 } 2454 bind(EXIT); 2455 2456 } // string_indexofC8 2457 2458 // Small strings are loaded through stack if they cross page boundary. 2459 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2460 Register cnt1, Register cnt2, 2461 int int_cnt2, Register result, 2462 XMMRegister vec, Register tmp, 2463 int ae) { 2464 ShortBranchVerifier sbv(this); 2465 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2466 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2467 2468 // 2469 // int_cnt2 is length of small (< 8 chars) constant substring 2470 // or (-1) for non constant substring in which case its length 2471 // is in cnt2 register. 2472 // 2473 // Note, inline_string_indexOf() generates checks: 2474 // if (substr.count > string.count) return -1; 2475 // if (substr.count == 0) return 0; 2476 // 2477 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2478 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2479 // This method uses the pcmpestri instruction with bound registers 2480 // inputs: 2481 // xmm - substring 2482 // rax - substring length (elements count) 2483 // mem - scanned string 2484 // rdx - string length (elements count) 2485 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2486 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2487 // outputs: 2488 // rcx - matched index in string 2489 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2490 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2491 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2492 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2493 2494 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2495 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2496 FOUND_CANDIDATE; 2497 2498 { //======================================================== 2499 // We don't know where these strings are located 2500 // and we can't read beyond them. Load them through stack. 2501 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2502 2503 movptr(tmp, rsp); // save old SP 2504 2505 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2506 if (int_cnt2 == (1>>scale2)) { // One byte 2507 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2508 load_unsigned_byte(result, Address(str2, 0)); 2509 movdl(vec, result); // move 32 bits 2510 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2511 // Not enough header space in 32-bit VM: 12+3 = 15. 2512 movl(result, Address(str2, -1)); 2513 shrl(result, 8); 2514 movdl(vec, result); // move 32 bits 2515 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2516 load_unsigned_short(result, Address(str2, 0)); 2517 movdl(vec, result); // move 32 bits 2518 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2519 movdl(vec, Address(str2, 0)); // move 32 bits 2520 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2521 movq(vec, Address(str2, 0)); // move 64 bits 2522 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2523 // Array header size is 12 bytes in 32-bit VM 2524 // + 6 bytes for 3 chars == 18 bytes, 2525 // enough space to load vec and shift. 2526 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2527 if (ae == StrIntrinsicNode::UL) { 2528 int tail_off = int_cnt2-8; 2529 pmovzxbw(vec, Address(str2, tail_off)); 2530 psrldq(vec, -2*tail_off); 2531 } 2532 else { 2533 int tail_off = int_cnt2*(1<<scale2); 2534 movdqu(vec, Address(str2, tail_off-16)); 2535 psrldq(vec, 16-tail_off); 2536 } 2537 } 2538 } else { // not constant substring 2539 cmpl(cnt2, stride); 2540 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2541 2542 // We can read beyond string if srt+16 does not cross page boundary 2543 // since heaps are aligned and mapped by pages. 2544 assert(os::vm_page_size() < (int)G, "default page should be small"); 2545 movl(result, str2); // We need only low 32 bits 2546 andl(result, (os::vm_page_size()-1)); 2547 cmpl(result, (os::vm_page_size()-16)); 2548 jccb(Assembler::belowEqual, CHECK_STR); 2549 2550 // Move small strings to stack to allow load 16 bytes into vec. 2551 subptr(rsp, 16); 2552 int stk_offset = wordSize-(1<<scale2); 2553 push(cnt2); 2554 2555 bind(COPY_SUBSTR); 2556 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2557 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2558 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2559 } else if (ae == StrIntrinsicNode::UU) { 2560 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2561 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2562 } 2563 decrement(cnt2); 2564 jccb(Assembler::notZero, COPY_SUBSTR); 2565 2566 pop(cnt2); 2567 movptr(str2, rsp); // New substring address 2568 } // non constant 2569 2570 bind(CHECK_STR); 2571 cmpl(cnt1, stride); 2572 jccb(Assembler::aboveEqual, BIG_STRINGS); 2573 2574 // Check cross page boundary. 
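// Same page-boundary test as for the substring above: a 16-byte load from str1 is safe
// unless it starts in the last 15 bytes of a page; otherwise the short string is copied
// to the stack below.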
2575 movl(result, str1); // We need only low 32 bits 2576 andl(result, (os::vm_page_size()-1)); 2577 cmpl(result, (os::vm_page_size()-16)); 2578 jccb(Assembler::belowEqual, BIG_STRINGS); 2579 2580 subptr(rsp, 16); 2581 int stk_offset = -(1<<scale1); 2582 if (int_cnt2 < 0) { // not constant 2583 push(cnt2); 2584 stk_offset += wordSize; 2585 } 2586 movl(cnt2, cnt1); 2587 2588 bind(COPY_STR); 2589 if (ae == StrIntrinsicNode::LL) { 2590 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2591 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2592 } else { 2593 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2594 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2595 } 2596 decrement(cnt2); 2597 jccb(Assembler::notZero, COPY_STR); 2598 2599 if (int_cnt2 < 0) { // not constant 2600 pop(cnt2); 2601 } 2602 movptr(str1, rsp); // New string address 2603 2604 bind(BIG_STRINGS); 2605 // Load substring. 2606 if (int_cnt2 < 0) { // -1 2607 if (ae == StrIntrinsicNode::UL) { 2608 pmovzxbw(vec, Address(str2, 0)); 2609 } else { 2610 movdqu(vec, Address(str2, 0)); 2611 } 2612 push(cnt2); // substr count 2613 push(str2); // substr addr 2614 push(str1); // string addr 2615 } else { 2616 // Small (< 8 chars) constant substrings are loaded already. 2617 movl(cnt2, int_cnt2); 2618 } 2619 push(tmp); // original SP 2620 2621 } // Finished loading 2622 2623 //======================================================== 2624 // Start search 2625 // 2626 2627 movptr(result, str1); // string addr 2628 2629 if (int_cnt2 < 0) { // Only for non constant substring 2630 jmpb(SCAN_TO_SUBSTR); 2631 2632 // SP saved at sp+0 2633 // String saved at sp+1*wordSize 2634 // Substr saved at sp+2*wordSize 2635 // Substr count saved at sp+3*wordSize 2636 2637 // Reload substr for rescan, this code 2638 // is executed only for large substrings (> 8 chars) 2639 bind(RELOAD_SUBSTR); 2640 movptr(str2, Address(rsp, 2*wordSize)); 2641 movl(cnt2, Address(rsp, 3*wordSize)); 2642 if (ae == StrIntrinsicNode::UL) { 2643 pmovzxbw(vec, Address(str2, 0)); 2644 } else { 2645 movdqu(vec, Address(str2, 0)); 2646 } 2647 // We came here after the beginning of the substring was 2648 // matched but the rest of it was not so we need to search 2649 // again. Start from the next element after the previous match. 2650 subptr(str1, result); // Restore counter 2651 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2652 shrl(str1, 1); 2653 } 2654 addl(cnt1, str1); 2655 decrementl(cnt1); // Shift to next element 2656 cmpl(cnt1, cnt2); 2657 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2658 2659 addptr(result, (1<<scale1)); 2660 } // non constant 2661 2662 // Scan string for start of substr in 16-byte vectors 2663 bind(SCAN_TO_SUBSTR); 2664 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2665 pcmpestri(vec, Address(result, 0), mode); 2666 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2667 subl(cnt1, stride); 2668 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2669 cmpl(cnt1, cnt2); 2670 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2671 addptr(result, 16); 2672 2673 bind(ADJUST_STR); 2674 cmpl(cnt1, stride); // Do not read beyond string 2675 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2676 // Back-up string to avoid reading beyond string. 
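// Re-position 'result' so that the final 16-byte read ends exactly at the end of the string;
// the overlapping bytes are simply scanned again.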
2677 lea(result, Address(result, cnt1, scale1, -16)); 2678 movl(cnt1, stride); 2679 jmpb(SCAN_TO_SUBSTR); 2680 2681 // Found a potential substr 2682 bind(FOUND_CANDIDATE); 2683 // After pcmpestri tmp(rcx) contains matched element index 2684 2685 // Make sure string is still long enough 2686 subl(cnt1, tmp); 2687 cmpl(cnt1, cnt2); 2688 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 2689 // Left less then substring. 2690 2691 bind(RET_NOT_FOUND); 2692 movl(result, -1); 2693 jmp(CLEANUP); 2694 2695 bind(FOUND_SUBSTR); 2696 // Compute start addr of substr 2697 lea(result, Address(result, tmp, scale1)); 2698 if (int_cnt2 > 0) { // Constant substring 2699 // Repeat search for small substring (< 8 chars) 2700 // from new point without reloading substring. 2701 // Have to check that we don't read beyond string. 2702 cmpl(tmp, stride-int_cnt2); 2703 jccb(Assembler::greater, ADJUST_STR); 2704 // Fall through if matched whole substring. 2705 } else { // non constant 2706 assert(int_cnt2 == -1, "should be != 0"); 2707 2708 addl(tmp, cnt2); 2709 // Found result if we matched whole substring. 2710 cmpl(tmp, stride); 2711 jcc(Assembler::lessEqual, RET_FOUND); 2712 2713 // Repeat search for small substring (<= 8 chars) 2714 // from new point 'str1' without reloading substring. 2715 cmpl(cnt2, stride); 2716 // Have to check that we don't read beyond string. 2717 jccb(Assembler::lessEqual, ADJUST_STR); 2718 2719 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 2720 // Compare the rest of substring (> 8 chars). 2721 movptr(str1, result); 2722 2723 cmpl(tmp, cnt2); 2724 // First 8 chars are already matched. 2725 jccb(Assembler::equal, CHECK_NEXT); 2726 2727 bind(SCAN_SUBSTR); 2728 pcmpestri(vec, Address(str1, 0), mode); 2729 // Need to reload strings pointers if not matched whole vector 2730 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2731 2732 bind(CHECK_NEXT); 2733 subl(cnt2, stride); 2734 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 2735 addptr(str1, 16); 2736 if (ae == StrIntrinsicNode::UL) { 2737 addptr(str2, 8); 2738 } else { 2739 addptr(str2, 16); 2740 } 2741 subl(cnt1, stride); 2742 cmpl(cnt2, stride); // Do not read beyond substring 2743 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 2744 // Back-up strings to avoid reading beyond substring. 
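// Step str2 back so the final vector read ends exactly at the end of the substring
// (str1 is moved by the same element count), then fix up the counters:
// cnt1 = cnt1 - cnt2 + stride, cnt2 = stride.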
2745 2746 if (ae == StrIntrinsicNode::UL) { 2747 lea(str2, Address(str2, cnt2, scale2, -8)); 2748 lea(str1, Address(str1, cnt2, scale1, -16)); 2749 } else { 2750 lea(str2, Address(str2, cnt2, scale2, -16)); 2751 lea(str1, Address(str1, cnt2, scale1, -16)); 2752 } 2753 subl(cnt1, cnt2); 2754 movl(cnt2, stride); 2755 addl(cnt1, stride); 2756 bind(CONT_SCAN_SUBSTR); 2757 if (ae == StrIntrinsicNode::UL) { 2758 pmovzxbw(vec, Address(str2, 0)); 2759 } else { 2760 movdqu(vec, Address(str2, 0)); 2761 } 2762 jmp(SCAN_SUBSTR); 2763 2764 bind(RET_FOUND_LONG); 2765 movptr(str1, Address(rsp, wordSize)); 2766 } // non constant 2767 2768 bind(RET_FOUND); 2769 // Compute substr offset 2770 subptr(result, str1); 2771 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2772 shrl(result, 1); // index 2773 } 2774 bind(CLEANUP); 2775 pop(rsp); // restore SP 2776 2777 } // string_indexof 2778 2779 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 2780 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 2781 ShortBranchVerifier sbv(this); 2782 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2783 2784 int stride = 8; 2785 2786 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 2787 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 2788 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 2789 FOUND_SEQ_CHAR, DONE_LABEL; 2790 2791 movptr(result, str1); 2792 if (UseAVX >= 2) { 2793 cmpl(cnt1, stride); 2794 jcc(Assembler::less, SCAN_TO_CHAR); 2795 cmpl(cnt1, 2*stride); 2796 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 2797 movdl(vec1, ch); 2798 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 2799 vpxor(vec2, vec2); 2800 movl(tmp, cnt1); 2801 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 2802 andl(cnt1,0x0000000F); //tail count (in chars) 2803 2804 bind(SCAN_TO_16_CHAR_LOOP); 2805 vmovdqu(vec3, Address(result, 0)); 2806 vpcmpeqw(vec3, vec3, vec1, 1); 2807 vptest(vec2, vec3); 2808 jcc(Assembler::carryClear, FOUND_CHAR); 2809 addptr(result, 32); 2810 subl(tmp, 2*stride); 2811 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 2812 jmp(SCAN_TO_8_CHAR); 2813 bind(SCAN_TO_8_CHAR_INIT); 2814 movdl(vec1, ch); 2815 pshuflw(vec1, vec1, 0x00); 2816 pshufd(vec1, vec1, 0); 2817 pxor(vec2, vec2); 2818 } 2819 bind(SCAN_TO_8_CHAR); 2820 cmpl(cnt1, stride); 2821 jcc(Assembler::less, SCAN_TO_CHAR); 2822 if (UseAVX < 2) { 2823 movdl(vec1, ch); 2824 pshuflw(vec1, vec1, 0x00); 2825 pshufd(vec1, vec1, 0); 2826 pxor(vec2, vec2); 2827 } 2828 movl(tmp, cnt1); 2829 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 2830 andl(cnt1,0x00000007); //tail count (in chars) 2831 2832 bind(SCAN_TO_8_CHAR_LOOP); 2833 movdqu(vec3, Address(result, 0)); 2834 pcmpeqw(vec3, vec1); 2835 ptest(vec2, vec3); 2836 jcc(Assembler::carryClear, FOUND_CHAR); 2837 addptr(result, 16); 2838 subl(tmp, stride); 2839 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 2840 bind(SCAN_TO_CHAR); 2841 testl(cnt1, cnt1); 2842 jcc(Assembler::zero, RET_NOT_FOUND); 2843 bind(SCAN_TO_CHAR_LOOP); 2844 load_unsigned_short(tmp, Address(result, 0)); 2845 cmpl(ch, tmp); 2846 jccb(Assembler::equal, FOUND_SEQ_CHAR); 2847 addptr(result, 2); 2848 subl(cnt1, 1); 2849 jccb(Assembler::zero, RET_NOT_FOUND); 2850 jmp(SCAN_TO_CHAR_LOOP); 2851 2852 bind(RET_NOT_FOUND); 2853 movl(result, -1); 2854 jmpb(DONE_LABEL); 2855 2856 bind(FOUND_CHAR); 2857 if (UseAVX >= 2) { 2858 vpmovmskb(tmp, vec3); 2859 } else { 2860 pmovmskb(tmp, vec3); 2861 } 2862 bsfl(ch, tmp); 2863 addptr(result, ch); 2864 2865 bind(FOUND_SEQ_CHAR); 2866 
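// 'result' holds the address of the matching char; convert it to a char index relative to str1.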
subptr(result, str1); 2867 shrl(result, 1); 2868 2869 bind(DONE_LABEL); 2870 } // string_indexof_char 2871 2872 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 2873 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 2874 ShortBranchVerifier sbv(this); 2875 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2876 2877 int stride = 16; 2878 2879 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 2880 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 2881 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 2882 FOUND_SEQ_CHAR, DONE_LABEL; 2883 2884 movptr(result, str1); 2885 if (UseAVX >= 2) { 2886 cmpl(cnt1, stride); 2887 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 2888 cmpl(cnt1, stride*2); 2889 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 2890 movdl(vec1, ch); 2891 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 2892 vpxor(vec2, vec2); 2893 movl(tmp, cnt1); 2894 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 2895 andl(cnt1,0x0000001F); //tail count (in chars) 2896 2897 bind(SCAN_TO_32_CHAR_LOOP); 2898 vmovdqu(vec3, Address(result, 0)); 2899 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 2900 vptest(vec2, vec3); 2901 jcc(Assembler::carryClear, FOUND_CHAR); 2902 addptr(result, 32); 2903 subl(tmp, stride*2); 2904 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 2905 jmp(SCAN_TO_16_CHAR); 2906 2907 bind(SCAN_TO_16_CHAR_INIT); 2908 movdl(vec1, ch); 2909 pxor(vec2, vec2); 2910 pshufb(vec1, vec2); 2911 } 2912 2913 bind(SCAN_TO_16_CHAR); 2914 cmpl(cnt1, stride); 2915 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 2916 if (UseAVX < 2) { 2917 movdl(vec1, ch); 2918 pxor(vec2, vec2); 2919 pshufb(vec1, vec2); 2920 } 2921 movl(tmp, cnt1); 2922 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 2923 andl(cnt1,0x0000000F); //tail count (in bytes) 2924 2925 bind(SCAN_TO_16_CHAR_LOOP); 2926 movdqu(vec3, Address(result, 0)); 2927 pcmpeqb(vec3, vec1); 2928 ptest(vec2, vec3); 2929 jcc(Assembler::carryClear, FOUND_CHAR); 2930 addptr(result, 16); 2931 subl(tmp, stride); 2932 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
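// Scalar tail: at most 15 bytes remain, compare them one at a time.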
2933 2934 bind(SCAN_TO_CHAR_INIT); 2935 testl(cnt1, cnt1); 2936 jcc(Assembler::zero, RET_NOT_FOUND); 2937 bind(SCAN_TO_CHAR_LOOP); 2938 load_unsigned_byte(tmp, Address(result, 0)); 2939 cmpl(ch, tmp); 2940 jccb(Assembler::equal, FOUND_SEQ_CHAR); 2941 addptr(result, 1); 2942 subl(cnt1, 1); 2943 jccb(Assembler::zero, RET_NOT_FOUND); 2944 jmp(SCAN_TO_CHAR_LOOP); 2945 2946 bind(RET_NOT_FOUND); 2947 movl(result, -1); 2948 jmpb(DONE_LABEL); 2949 2950 bind(FOUND_CHAR); 2951 if (UseAVX >= 2) { 2952 vpmovmskb(tmp, vec3); 2953 } else { 2954 pmovmskb(tmp, vec3); 2955 } 2956 bsfl(ch, tmp); 2957 addptr(result, ch); 2958 2959 bind(FOUND_SEQ_CHAR); 2960 subptr(result, str1); 2961 2962 bind(DONE_LABEL); 2963 } // stringL_indexof_char 2964 2965 // helper function for string_compare 2966 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 2967 Address::ScaleFactor scale, Address::ScaleFactor scale1, 2968 Address::ScaleFactor scale2, Register index, int ae) { 2969 if (ae == StrIntrinsicNode::LL) { 2970 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 2971 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 2972 } else if (ae == StrIntrinsicNode::UU) { 2973 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 2974 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 2975 } else { 2976 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 2977 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 2978 } 2979 } 2980 2981 // Compare strings, used for char[] and byte[]. 2982 void C2_MacroAssembler::string_compare(Register str1, Register str2, 2983 Register cnt1, Register cnt2, Register result, 2984 XMMRegister vec1, int ae, KRegister mask) { 2985 ShortBranchVerifier sbv(this); 2986 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 2987 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 2988 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 2989 int stride2x2 = 0x40; 2990 Address::ScaleFactor scale = Address::no_scale; 2991 Address::ScaleFactor scale1 = Address::no_scale; 2992 Address::ScaleFactor scale2 = Address::no_scale; 2993 2994 if (ae != StrIntrinsicNode::LL) { 2995 stride2x2 = 0x20; 2996 } 2997 2998 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 2999 shrl(cnt2, 1); 3000 } 3001 // Compute the minimum of the string lengths and the 3002 // difference of the string lengths (stack). 3003 // Do the conditional move stuff 3004 movl(result, cnt1); 3005 subl(cnt1, cnt2); 3006 push(cnt1); 3007 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3008 3009 // Is the minimum length zero? 
3010 testl(cnt2, cnt2); 3011 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3012 if (ae == StrIntrinsicNode::LL) { 3013 // Load first bytes 3014 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3015 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3016 } else if (ae == StrIntrinsicNode::UU) { 3017 // Load first characters 3018 load_unsigned_short(result, Address(str1, 0)); 3019 load_unsigned_short(cnt1, Address(str2, 0)); 3020 } else { 3021 load_unsigned_byte(result, Address(str1, 0)); 3022 load_unsigned_short(cnt1, Address(str2, 0)); 3023 } 3024 subl(result, cnt1); 3025 jcc(Assembler::notZero, POP_LABEL); 3026 3027 if (ae == StrIntrinsicNode::UU) { 3028 // Divide length by 2 to get number of chars 3029 shrl(cnt2, 1); 3030 } 3031 cmpl(cnt2, 1); 3032 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3033 3034 // Check if the strings start at the same location and setup scale and stride 3035 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3036 cmpptr(str1, str2); 3037 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3038 if (ae == StrIntrinsicNode::LL) { 3039 scale = Address::times_1; 3040 stride = 16; 3041 } else { 3042 scale = Address::times_2; 3043 stride = 8; 3044 } 3045 } else { 3046 scale1 = Address::times_1; 3047 scale2 = Address::times_2; 3048 // scale not used 3049 stride = 8; 3050 } 3051 3052 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3053 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3054 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3055 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3056 Label COMPARE_TAIL_LONG; 3057 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3058 3059 int pcmpmask = 0x19; 3060 if (ae == StrIntrinsicNode::LL) { 3061 pcmpmask &= ~0x01; 3062 } 3063 3064 // Setup to compare 16-chars (32-bytes) vectors, 3065 // start from first character again because it has aligned address. 3066 if (ae == StrIntrinsicNode::LL) { 3067 stride2 = 32; 3068 } else { 3069 stride2 = 16; 3070 } 3071 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3072 adr_stride = stride << scale; 3073 } else { 3074 adr_stride1 = 8; //stride << scale1; 3075 adr_stride2 = 16; //stride << scale2; 3076 } 3077 3078 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3079 // rax and rdx are used by pcmpestri as elements counters 3080 movl(result, cnt2); 3081 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3082 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3083 3084 // fast path : compare first 2 8-char vectors. 
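// pcmpestri performs an element-wise 'equal each' compare with negated polarity
// (imm 0x19 for char elements, 0x18 for bytes), so CF is set on a mismatch and
// cnt1 (rcx) receives the index of the first differing element.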
3085 bind(COMPARE_16_CHARS); 3086 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3087 movdqu(vec1, Address(str1, 0)); 3088 } else { 3089 pmovzxbw(vec1, Address(str1, 0)); 3090 } 3091 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3092 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3093 3094 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3095 movdqu(vec1, Address(str1, adr_stride)); 3096 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3097 } else { 3098 pmovzxbw(vec1, Address(str1, adr_stride1)); 3099 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3100 } 3101 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3102 addl(cnt1, stride); 3103 3104 // Compare the characters at index in cnt1 3105 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3106 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3107 subl(result, cnt2); 3108 jmp(POP_LABEL); 3109 3110 // Setup the registers to start vector comparison loop 3111 bind(COMPARE_WIDE_VECTORS); 3112 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3113 lea(str1, Address(str1, result, scale)); 3114 lea(str2, Address(str2, result, scale)); 3115 } else { 3116 lea(str1, Address(str1, result, scale1)); 3117 lea(str2, Address(str2, result, scale2)); 3118 } 3119 subl(result, stride2); 3120 subl(cnt2, stride2); 3121 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3122 negptr(result); 3123 3124 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3125 bind(COMPARE_WIDE_VECTORS_LOOP); 3126 3127 #ifdef _LP64 3128 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3129 cmpl(cnt2, stride2x2); 3130 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3131 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3132 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3133 3134 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3135 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3136 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3137 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3138 } else { 3139 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3140 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3141 } 3142 kortestql(mask, mask); 3143 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3144 addptr(result, stride2x2); // update since we already compared at this addr 3145 subl(cnt2, stride2x2); // and sub the size too 3146 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3147 3148 vpxor(vec1, vec1); 3149 jmpb(COMPARE_WIDE_TAIL); 3150 }//if (VM_Version::supports_avx512vlbw()) 3151 #endif // _LP64 3152 3153 3154 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3155 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3156 vmovdqu(vec1, Address(str1, result, scale)); 3157 vpxor(vec1, Address(str2, result, scale)); 3158 } else { 3159 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3160 vpxor(vec1, Address(str2, result, scale2)); 3161 } 3162 vptest(vec1, vec1); 3163 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3164 addptr(result, stride2); 3165 subl(cnt2, stride2); 3166 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3167 // clean upper bits of YMM registers 
3168 vpxor(vec1, vec1); 3169 3170 // compare wide vectors tail 3171 bind(COMPARE_WIDE_TAIL); 3172 testptr(result, result); 3173 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3174 3175 movl(result, stride2); 3176 movl(cnt2, result); 3177 negptr(result); 3178 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3179 3180 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3181 bind(VECTOR_NOT_EQUAL); 3182 // clean upper bits of YMM registers 3183 vpxor(vec1, vec1); 3184 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3185 lea(str1, Address(str1, result, scale)); 3186 lea(str2, Address(str2, result, scale)); 3187 } else { 3188 lea(str1, Address(str1, result, scale1)); 3189 lea(str2, Address(str2, result, scale2)); 3190 } 3191 jmp(COMPARE_16_CHARS); 3192 3193 // Compare tail chars, length between 1 to 15 chars 3194 bind(COMPARE_TAIL_LONG); 3195 movl(cnt2, result); 3196 cmpl(cnt2, stride); 3197 jcc(Assembler::less, COMPARE_SMALL_STR); 3198 3199 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3200 movdqu(vec1, Address(str1, 0)); 3201 } else { 3202 pmovzxbw(vec1, Address(str1, 0)); 3203 } 3204 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3205 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3206 subptr(cnt2, stride); 3207 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3208 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3209 lea(str1, Address(str1, result, scale)); 3210 lea(str2, Address(str2, result, scale)); 3211 } else { 3212 lea(str1, Address(str1, result, scale1)); 3213 lea(str2, Address(str2, result, scale2)); 3214 } 3215 negptr(cnt2); 3216 jmpb(WHILE_HEAD_LABEL); 3217 3218 bind(COMPARE_SMALL_STR); 3219 } else if (UseSSE42Intrinsics) { 3220 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3221 int pcmpmask = 0x19; 3222 // Setup to compare 8-char (16-byte) vectors, 3223 // start from first character again because it has aligned address. 
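    // The masking below rounds the element count down to a multiple of stride,
    // e.g. for a minimum length of 13 and stride == 8 the vector loop sees 8
    // elements and the remaining 5 are covered by the tail/scalar code.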
3224 movl(result, cnt2); 3225 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3226 if (ae == StrIntrinsicNode::LL) { 3227 pcmpmask &= ~0x01; 3228 } 3229 jcc(Assembler::zero, COMPARE_TAIL); 3230 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3231 lea(str1, Address(str1, result, scale)); 3232 lea(str2, Address(str2, result, scale)); 3233 } else { 3234 lea(str1, Address(str1, result, scale1)); 3235 lea(str2, Address(str2, result, scale2)); 3236 } 3237 negptr(result); 3238 3239 // pcmpestri 3240 // inputs: 3241 // vec1- substring 3242 // rax - negative string length (elements count) 3243 // mem - scanned string 3244 // rdx - string length (elements count) 3245 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3246 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3247 // outputs: 3248 // rcx - first mismatched element index 3249 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3250 3251 bind(COMPARE_WIDE_VECTORS); 3252 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3253 movdqu(vec1, Address(str1, result, scale)); 3254 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3255 } else { 3256 pmovzxbw(vec1, Address(str1, result, scale1)); 3257 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3258 } 3259 // After pcmpestri cnt1(rcx) contains mismatched element index 3260 3261 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3262 addptr(result, stride); 3263 subptr(cnt2, stride); 3264 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3265 3266 // compare wide vectors tail 3267 testptr(result, result); 3268 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3269 3270 movl(cnt2, stride); 3271 movl(result, stride); 3272 negptr(result); 3273 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3274 movdqu(vec1, Address(str1, result, scale)); 3275 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3276 } else { 3277 pmovzxbw(vec1, Address(str1, result, scale1)); 3278 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3279 } 3280 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3281 3282 // Mismatched characters in the vectors 3283 bind(VECTOR_NOT_EQUAL); 3284 addptr(cnt1, result); 3285 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3286 subl(result, cnt2); 3287 jmpb(POP_LABEL); 3288 3289 bind(COMPARE_TAIL); // limit is zero 3290 movl(cnt2, result); 3291 // Fallthru to tail compare 3292 } 3293 // Shift str2 and str1 to the end of the arrays, negate min 3294 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3295 lea(str1, Address(str1, cnt2, scale)); 3296 lea(str2, Address(str2, cnt2, scale)); 3297 } else { 3298 lea(str1, Address(str1, cnt2, scale1)); 3299 lea(str2, Address(str2, cnt2, scale2)); 3300 } 3301 decrementl(cnt2); // first character was compared already 3302 negptr(cnt2); 3303 3304 // Compare the rest of the elements 3305 bind(WHILE_HEAD_LABEL); 3306 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3307 subl(result, cnt1); 3308 jccb(Assembler::notZero, POP_LABEL); 3309 increment(cnt2); 3310 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3311 3312 // Strings are equal up to min length. Return the length difference. 
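  // At this point the strings matched over the compared range. Illustrative scalar
  // equivalent of the whole intrinsic (sketch only; element loads depend on ae, and
  // for ae == UL the result is additionally negated at DONE_LABEL):
  //   int min = min(cnt1, cnt2);
  //   for (int i = 0; i < min; i++) {
  //     int diff = str1[i] - str2[i];
  //     if (diff != 0) return diff;
  //   }
  //   return cnt1 - cnt2;   // the length difference pushed at the top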
3313 bind(LENGTH_DIFF_LABEL); 3314 pop(result); 3315 if (ae == StrIntrinsicNode::UU) { 3316 // Divide diff by 2 to get number of chars 3317 sarl(result, 1); 3318 } 3319 jmpb(DONE_LABEL); 3320 3321 #ifdef _LP64 3322 if (VM_Version::supports_avx512vlbw()) { 3323 3324 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3325 3326 kmovql(cnt1, mask); 3327 notq(cnt1); 3328 bsfq(cnt2, cnt1); 3329 if (ae != StrIntrinsicNode::LL) { 3330 // Divide diff by 2 to get number of chars 3331 sarl(cnt2, 1); 3332 } 3333 addq(result, cnt2); 3334 if (ae == StrIntrinsicNode::LL) { 3335 load_unsigned_byte(cnt1, Address(str2, result)); 3336 load_unsigned_byte(result, Address(str1, result)); 3337 } else if (ae == StrIntrinsicNode::UU) { 3338 load_unsigned_short(cnt1, Address(str2, result, scale)); 3339 load_unsigned_short(result, Address(str1, result, scale)); 3340 } else { 3341 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3342 load_unsigned_byte(result, Address(str1, result, scale1)); 3343 } 3344 subl(result, cnt1); 3345 jmpb(POP_LABEL); 3346 }//if (VM_Version::supports_avx512vlbw()) 3347 #endif // _LP64 3348 3349 // Discard the stored length difference 3350 bind(POP_LABEL); 3351 pop(cnt1); 3352 3353 // That's it 3354 bind(DONE_LABEL); 3355 if(ae == StrIntrinsicNode::UL) { 3356 negl(result); 3357 } 3358 3359 } 3360 3361 // Search for Non-ASCII character (Negative byte value) in a byte array, 3362 // return the index of the first such character, otherwise the length 3363 // of the array segment searched. 3364 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3365 // @IntrinsicCandidate 3366 // public static int countPositives(byte[] ba, int off, int len) { 3367 // for (int i = off; i < off + len; i++) { 3368 // if (ba[i] < 0) { 3369 // return i - off; 3370 // } 3371 // } 3372 // return len; 3373 // } 3374 void C2_MacroAssembler::count_positives(Register ary1, Register len, 3375 Register result, Register tmp1, 3376 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3377 // rsi: byte array 3378 // rcx: len 3379 // rax: result 3380 ShortBranchVerifier sbv(this); 3381 assert_different_registers(ary1, len, result, tmp1); 3382 assert_different_registers(vec1, vec2); 3383 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3384 3385 movl(result, len); // copy 3386 // len == 0 3387 testl(len, len); 3388 jcc(Assembler::zero, DONE); 3389 3390 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3391 VM_Version::supports_avx512vlbw() && 3392 VM_Version::supports_bmi2()) { 3393 3394 Label test_64_loop, test_tail, BREAK_LOOP; 3395 Register tmp3_aliased = len; 3396 3397 movl(tmp1, len); 3398 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3399 3400 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F 3401 andl(len, ~(64 - 1)); // vector count (in chars) 3402 jccb(Assembler::zero, test_tail); 3403 3404 lea(ary1, Address(ary1, len, Address::times_1)); 3405 negptr(len); 3406 3407 bind(test_64_loop); 3408 // Check whether our 64 elements of size byte contain negatives 3409 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3410 kortestql(mask1, mask1); 3411 jcc(Assembler::notZero, BREAK_LOOP); 3412 3413 addptr(len, 64); 3414 jccb(Assembler::notZero, test_64_loop); 3415 3416 bind(test_tail); 3417 // bail out when there is nothing to be done 3418 testl(tmp1, -1); 3419 jcc(Assembler::zero, DONE); 3420 3421 // ~(~0 << len) applied up to two times (for 32-bit scenario) 3422 #ifdef _LP64 3423 mov64(tmp3_aliased, 
          0xFFFFFFFFFFFFFFFF);
    shlxq(tmp3_aliased, tmp3_aliased, tmp1);
    notq(tmp3_aliased);
    kmovql(mask2, tmp3_aliased);
#else
    Label k_init;
    jmp(k_init);

    // We cannot read 64 bits from a general purpose register here, so the data
    // required to compose 64 1's is moved into the instruction stream instead.
    // We emit a 64-byte-wide series of elements from 0..63 which is later used
    // as compare targets against the tail count contained in the tmp1 register.
    // The result is a k register holding tmp1 consecutive 1's, counting from the
    // least significant bit.
    address tmp = pc();
    emit_int64(0x0706050403020100);
    emit_int64(0x0F0E0D0C0B0A0908);
    emit_int64(0x1716151413121110);
    emit_int64(0x1F1E1D1C1B1A1918);
    emit_int64(0x2726252423222120);
    emit_int64(0x2F2E2D2C2B2A2928);
    emit_int64(0x3736353433323130);
    emit_int64(0x3F3E3D3C3B3A3938);

    bind(k_init);
    lea(len, InternalAddress(tmp));
    // create mask to test for negative byte inside a vector
    evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
    evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);

#endif
    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 bytes is negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are zeros, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
3508 // Set up to look at the last 32 bytes as if they were a tail 3509 lea(ary1, Address(ary1, len, Address::times_1)); 3510 addptr(result, len); 3511 // Ignore the very last byte: if all others are positive, 3512 // it must be negative, so we can skip right to the 2+1 byte 3513 // end comparison at this point 3514 orl(result, 31); 3515 movl(len, 31); 3516 // Fallthru to tail compare 3517 } else if (UseSSE42Intrinsics) { 3518 // With SSE4.2, use double quad vector compare 3519 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 3520 3521 // Compare 16-byte vectors 3522 testl(len, 0xfffffff0); // vector count (in bytes) 3523 jcc(Assembler::zero, TAIL_START); 3524 3525 andl(len, 0xfffffff0); 3526 lea(ary1, Address(ary1, len, Address::times_1)); 3527 negptr(len); 3528 3529 movl(tmp1, 0x80808080); 3530 movdl(vec2, tmp1); 3531 pshufd(vec2, vec2, 0); 3532 3533 bind(COMPARE_WIDE_VECTORS); 3534 movdqu(vec1, Address(ary1, len, Address::times_1)); 3535 ptest(vec1, vec2); 3536 jccb(Assembler::notZero, BREAK_LOOP); 3537 addptr(len, 16); 3538 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3539 3540 testl(result, 0x0000000f); // len is zero, any bytes remaining? 3541 jcc(Assembler::zero, DONE); 3542 3543 // Quick test using the already prepared vector mask 3544 movl(len, result); 3545 andl(len, 0x0000000f); // tail count (in bytes) 3546 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 3547 ptest(vec1, vec2); 3548 jcc(Assembler::zero, DONE); 3549 jmpb(TAIL_START); 3550 3551 bind(BREAK_LOOP); 3552 // At least one byte in the last 16-byte vector is negative. 3553 // Set up and look at the last 16 bytes as if they were a tail 3554 lea(ary1, Address(ary1, len, Address::times_1)); 3555 addptr(result, len); 3556 // Ignore the very last byte: if all others are positive, 3557 // it must be negative, so we can skip right to the 2+1 byte 3558 // end comparison at this point 3559 orl(result, 15); 3560 movl(len, 15); 3561 // Fallthru to tail compare 3562 } 3563 } 3564 3565 bind(TAIL_START); 3566 // Compare 4-byte vectors 3567 andl(len, 0xfffffffc); // vector count (in bytes) 3568 jccb(Assembler::zero, COMPARE_CHAR); 3569 3570 lea(ary1, Address(ary1, len, Address::times_1)); 3571 negptr(len); 3572 3573 bind(COMPARE_VECTORS); 3574 movl(tmp1, Address(ary1, len, Address::times_1)); 3575 andl(tmp1, 0x80808080); 3576 jccb(Assembler::notZero, TAIL_ADJUST); 3577 addptr(len, 4); 3578 jccb(Assembler::notZero, COMPARE_VECTORS); 3579 3580 // Compare trailing char (final 2-3 bytes), if any 3581 bind(COMPARE_CHAR); 3582 3583 testl(result, 0x2); // tail char 3584 jccb(Assembler::zero, COMPARE_BYTE); 3585 load_unsigned_short(tmp1, Address(ary1, 0)); 3586 andl(tmp1, 0x00008080); 3587 jccb(Assembler::notZero, CHAR_ADJUST); 3588 lea(ary1, Address(ary1, 2)); 3589 3590 bind(COMPARE_BYTE); 3591 testl(result, 0x1); // tail byte 3592 jccb(Assembler::zero, DONE); 3593 load_unsigned_byte(tmp1, Address(ary1, 0)); 3594 testl(tmp1, 0x00000080); 3595 jccb(Assembler::zero, DONE); 3596 subptr(result, 1); 3597 jmpb(DONE); 3598 3599 bind(TAIL_ADJUST); 3600 // there are negative bits in the last 4 byte block. 3601 // Adjust result and check the next three bytes 3602 addptr(result, len); 3603 orl(result, 3); 3604 lea(ary1, Address(ary1, len, Address::times_1)); 3605 jmpb(COMPARE_CHAR); 3606 3607 bind(CHAR_ADJUST); 3608 // We are looking at a char + optional byte tail, and found that one 3609 // of the bytes in the char is negative. Adjust the result, check the 3610 // first byte and readjust if needed. 
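  // Worked example: the 4-byte block holding the tail starts at index 8 and the two
  // char bytes are { 0x41, 0x80 }. The andl below restores result to 8; the low
  // (first) byte 0x41 is positive, so result is bumped to 9, the index of the first
  // negative byte. Had the first byte been negative, result would stay at 8.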
3611 andl(result, 0xfffffffc); 3612 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 3613 jccb(Assembler::notZero, DONE); 3614 addptr(result, 1); 3615 3616 // That's it 3617 bind(DONE); 3618 if (UseAVX >= 2 && UseSSE >= 2) { 3619 // clean upper bits of YMM registers 3620 vpxor(vec1, vec1); 3621 vpxor(vec2, vec2); 3622 } 3623 } 3624 3625 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 3626 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 3627 Register limit, Register result, Register chr, 3628 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) { 3629 ShortBranchVerifier sbv(this); 3630 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 3631 3632 int length_offset = arrayOopDesc::length_offset_in_bytes(); 3633 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 3634 3635 if (is_array_equ) { 3636 // Check the input args 3637 cmpoop(ary1, ary2); 3638 jcc(Assembler::equal, TRUE_LABEL); 3639 3640 // Need additional checks for arrays_equals. 3641 testptr(ary1, ary1); 3642 jcc(Assembler::zero, FALSE_LABEL); 3643 testptr(ary2, ary2); 3644 jcc(Assembler::zero, FALSE_LABEL); 3645 3646 // Check the lengths 3647 movl(limit, Address(ary1, length_offset)); 3648 cmpl(limit, Address(ary2, length_offset)); 3649 jcc(Assembler::notEqual, FALSE_LABEL); 3650 } 3651 3652 // count == 0 3653 testl(limit, limit); 3654 jcc(Assembler::zero, TRUE_LABEL); 3655 3656 if (is_array_equ) { 3657 // Load array address 3658 lea(ary1, Address(ary1, base_offset)); 3659 lea(ary2, Address(ary2, base_offset)); 3660 } 3661 3662 if (is_array_equ && is_char) { 3663 // arrays_equals when used for char[]. 3664 shll(limit, 1); // byte count != 0 3665 } 3666 movl(result, limit); // copy 3667 3668 if (UseAVX >= 2) { 3669 // With AVX2, use 32-byte vector compare 3670 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3671 3672 // Compare 32-byte vectors 3673 andl(result, 0x0000001f); // tail count (in bytes) 3674 andl(limit, 0xffffffe0); // vector count (in bytes) 3675 jcc(Assembler::zero, COMPARE_TAIL); 3676 3677 lea(ary1, Address(ary1, limit, Address::times_1)); 3678 lea(ary2, Address(ary2, limit, Address::times_1)); 3679 negptr(limit); 3680 3681 #ifdef _LP64 3682 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3683 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 3684 3685 cmpl(limit, -64); 3686 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3687 3688 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3689 3690 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 3691 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 3692 kortestql(mask, mask); 3693 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 3694 addptr(limit, 64); // update since we already compared at this addr 3695 cmpl(limit, -64); 3696 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3697 3698 // At this point we may still need to compare -limit+result bytes. 3699 // We could execute the next two instruction and just continue via non-wide path: 3700 // cmpl(limit, 0); 3701 // jcc(Assembler::equal, COMPARE_TAIL); // true 3702 // But since we stopped at the points ary{1,2}+limit which are 3703 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 3704 // (|limit| <= 32 and result < 32), 3705 // we may just compare the last 64 bytes. 
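      // For example, if the loop exits with limit == -32 and result == 20, 52 bytes
      // are still unverified; the single 64-byte compare ending at ary{1,2}+result
      // covers all of them and merely re-checks 12 bytes that already matched.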
3706 // 3707 addptr(result, -64); // it is safe, bc we just came from this area 3708 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 3709 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 3710 kortestql(mask, mask); 3711 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 3712 3713 jmp(TRUE_LABEL); 3714 3715 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3716 3717 }//if (VM_Version::supports_avx512vlbw()) 3718 #endif //_LP64 3719 bind(COMPARE_WIDE_VECTORS); 3720 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 3721 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 3722 vpxor(vec1, vec2); 3723 3724 vptest(vec1, vec1); 3725 jcc(Assembler::notZero, FALSE_LABEL); 3726 addptr(limit, 32); 3727 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3728 3729 testl(result, result); 3730 jcc(Assembler::zero, TRUE_LABEL); 3731 3732 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 3733 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 3734 vpxor(vec1, vec2); 3735 3736 vptest(vec1, vec1); 3737 jccb(Assembler::notZero, FALSE_LABEL); 3738 jmpb(TRUE_LABEL); 3739 3740 bind(COMPARE_TAIL); // limit is zero 3741 movl(limit, result); 3742 // Fallthru to tail compare 3743 } else if (UseSSE42Intrinsics) { 3744 // With SSE4.2, use double quad vector compare 3745 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3746 3747 // Compare 16-byte vectors 3748 andl(result, 0x0000000f); // tail count (in bytes) 3749 andl(limit, 0xfffffff0); // vector count (in bytes) 3750 jcc(Assembler::zero, COMPARE_TAIL); 3751 3752 lea(ary1, Address(ary1, limit, Address::times_1)); 3753 lea(ary2, Address(ary2, limit, Address::times_1)); 3754 negptr(limit); 3755 3756 bind(COMPARE_WIDE_VECTORS); 3757 movdqu(vec1, Address(ary1, limit, Address::times_1)); 3758 movdqu(vec2, Address(ary2, limit, Address::times_1)); 3759 pxor(vec1, vec2); 3760 3761 ptest(vec1, vec1); 3762 jcc(Assembler::notZero, FALSE_LABEL); 3763 addptr(limit, 16); 3764 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3765 3766 testl(result, result); 3767 jcc(Assembler::zero, TRUE_LABEL); 3768 3769 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 3770 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 3771 pxor(vec1, vec2); 3772 3773 ptest(vec1, vec1); 3774 jccb(Assembler::notZero, FALSE_LABEL); 3775 jmpb(TRUE_LABEL); 3776 3777 bind(COMPARE_TAIL); // limit is zero 3778 movl(limit, result); 3779 // Fallthru to tail compare 3780 } 3781 3782 // Compare 4-byte vectors 3783 andl(limit, 0xfffffffc); // vector count (in bytes) 3784 jccb(Assembler::zero, COMPARE_CHAR); 3785 3786 lea(ary1, Address(ary1, limit, Address::times_1)); 3787 lea(ary2, Address(ary2, limit, Address::times_1)); 3788 negptr(limit); 3789 3790 bind(COMPARE_VECTORS); 3791 movl(chr, Address(ary1, limit, Address::times_1)); 3792 cmpl(chr, Address(ary2, limit, Address::times_1)); 3793 jccb(Assembler::notEqual, FALSE_LABEL); 3794 addptr(limit, 4); 3795 jcc(Assembler::notZero, COMPARE_VECTORS); 3796 3797 // Compare trailing char (final 2 bytes), if any 3798 bind(COMPARE_CHAR); 3799 testl(result, 0x2); // tail char 3800 jccb(Assembler::zero, COMPARE_BYTE); 3801 load_unsigned_short(chr, Address(ary1, 0)); 3802 load_unsigned_short(limit, Address(ary2, 0)); 3803 cmpl(chr, limit); 3804 jccb(Assembler::notEqual, FALSE_LABEL); 3805 3806 if (is_array_equ && is_char) { 3807 bind(COMPARE_BYTE); 3808 } else { 3809 lea(ary1, Address(ary1, 2)); 3810 lea(ary2, Address(ary2, 2)); 3811 3812 bind(COMPARE_BYTE); 3813 testl(result, 0x1); 
// tail byte 3814 jccb(Assembler::zero, TRUE_LABEL); 3815 load_unsigned_byte(chr, Address(ary1, 0)); 3816 load_unsigned_byte(limit, Address(ary2, 0)); 3817 cmpl(chr, limit); 3818 jccb(Assembler::notEqual, FALSE_LABEL); 3819 } 3820 bind(TRUE_LABEL); 3821 movl(result, 1); // return true 3822 jmpb(DONE); 3823 3824 bind(FALSE_LABEL); 3825 xorl(result, result); // return false 3826 3827 // That's it 3828 bind(DONE); 3829 if (UseAVX >= 2) { 3830 // clean upper bits of YMM registers 3831 vpxor(vec1, vec1); 3832 vpxor(vec2, vec2); 3833 } 3834 } 3835 3836 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 3837 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 3838 switch(ideal_opc) { 3839 case Op_LShiftVS: 3840 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 3841 case Op_LShiftVI: 3842 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 3843 case Op_LShiftVL: 3844 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 3845 case Op_RShiftVS: 3846 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 3847 case Op_RShiftVI: 3848 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 3849 case Op_RShiftVL: 3850 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 3851 case Op_URShiftVS: 3852 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 3853 case Op_URShiftVI: 3854 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 3855 case Op_URShiftVL: 3856 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 3857 case Op_RotateRightV: 3858 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 3859 case Op_RotateLeftV: 3860 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 3861 default: 3862 fatal("Unsupported masked operation"); break; 3863 } 3864 } 3865 3866 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 3867 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 3868 bool is_varshift) { 3869 switch (ideal_opc) { 3870 case Op_AddVB: 3871 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 3872 case Op_AddVS: 3873 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 3874 case Op_AddVI: 3875 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 3876 case Op_AddVL: 3877 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 3878 case Op_AddVF: 3879 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 3880 case Op_AddVD: 3881 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 3882 case Op_SubVB: 3883 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 3884 case Op_SubVS: 3885 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 3886 case Op_SubVI: 3887 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 3888 case Op_SubVL: 3889 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 3890 case Op_SubVF: 3891 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 3892 case Op_SubVD: 3893 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 3894 case Op_MulVS: 3895 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 3896 case Op_MulVI: 3897 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 3898 case Op_MulVL: 3899 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 3900 case Op_MulVF: 3901 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 3902 case Op_MulVD: 3903 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 3904 case Op_DivVF: 3905 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 
3906 case Op_DivVD: 3907 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 3908 case Op_SqrtVF: 3909 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 3910 case Op_SqrtVD: 3911 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 3912 case Op_AbsVB: 3913 evpabsb(dst, mask, src2, merge, vlen_enc); break; 3914 case Op_AbsVS: 3915 evpabsw(dst, mask, src2, merge, vlen_enc); break; 3916 case Op_AbsVI: 3917 evpabsd(dst, mask, src2, merge, vlen_enc); break; 3918 case Op_AbsVL: 3919 evpabsq(dst, mask, src2, merge, vlen_enc); break; 3920 case Op_FmaVF: 3921 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 3922 case Op_FmaVD: 3923 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 3924 case Op_VectorRearrange: 3925 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 3926 case Op_LShiftVS: 3927 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 3928 case Op_LShiftVI: 3929 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 3930 case Op_LShiftVL: 3931 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 3932 case Op_RShiftVS: 3933 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 3934 case Op_RShiftVI: 3935 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 3936 case Op_RShiftVL: 3937 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 3938 case Op_URShiftVS: 3939 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 3940 case Op_URShiftVI: 3941 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 3942 case Op_URShiftVL: 3943 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 3944 case Op_RotateLeftV: 3945 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 3946 case Op_RotateRightV: 3947 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 3948 case Op_MaxV: 3949 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 3950 case Op_MinV: 3951 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 3952 case Op_XorV: 3953 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 3954 case Op_OrV: 3955 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 3956 case Op_AndV: 3957 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 3958 default: 3959 fatal("Unsupported masked operation"); break; 3960 } 3961 } 3962 3963 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 3964 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 3965 switch (ideal_opc) { 3966 case Op_AddVB: 3967 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 3968 case Op_AddVS: 3969 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 3970 case Op_AddVI: 3971 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 3972 case Op_AddVL: 3973 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 3974 case Op_AddVF: 3975 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 3976 case Op_AddVD: 3977 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 3978 case Op_SubVB: 3979 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 3980 case Op_SubVS: 3981 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 3982 case Op_SubVI: 3983 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 3984 case Op_SubVL: 3985 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 3986 case Op_SubVF: 3987 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 3988 case Op_SubVD: 3989 evsubpd(dst, mask, src1, src2, merge, vlen_enc); 
break; 3990 case Op_MulVS: 3991 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 3992 case Op_MulVI: 3993 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 3994 case Op_MulVL: 3995 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 3996 case Op_MulVF: 3997 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 3998 case Op_MulVD: 3999 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4000 case Op_DivVF: 4001 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4002 case Op_DivVD: 4003 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4004 case Op_FmaVF: 4005 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4006 case Op_FmaVD: 4007 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4008 case Op_MaxV: 4009 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4010 case Op_MinV: 4011 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4012 case Op_XorV: 4013 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4014 case Op_OrV: 4015 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4016 case Op_AndV: 4017 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4018 default: 4019 fatal("Unsupported masked operation"); break; 4020 } 4021 } 4022 4023 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4024 KRegister src1, KRegister src2) { 4025 BasicType etype = T_ILLEGAL; 4026 switch(mask_len) { 4027 case 2: 4028 case 4: 4029 case 8: etype = T_BYTE; break; 4030 case 16: etype = T_SHORT; break; 4031 case 32: etype = T_INT; break; 4032 case 64: etype = T_LONG; break; 4033 default: fatal("Unsupported type"); break; 4034 } 4035 assert(etype != T_ILLEGAL, ""); 4036 switch(ideal_opc) { 4037 case Op_AndVMask: 4038 kand(etype, dst, src1, src2); break; 4039 case Op_OrVMask: 4040 kor(etype, dst, src1, src2); break; 4041 case Op_XorVMask: 4042 kxor(etype, dst, src1, src2); break; 4043 default: 4044 fatal("Unsupported masked operation"); break; 4045 } 4046 } 4047 4048 /* 4049 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4050 * If src is NaN, the result is 0. 4051 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4052 * the result is equal to the value of Integer.MIN_VALUE. 4053 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4054 * the result is equal to the value of Integer.MAX_VALUE. 4055 */ 4056 void C2_MacroAssembler::vector_cast_float_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4057 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4058 Register scratch, AddressLiteral float_sign_flip, 4059 int vec_enc) { 4060 Label done; 4061 vmovdqu(xtmp1, float_sign_flip, scratch, vec_enc); 4062 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4063 vptest(xtmp2, xtmp2, vec_enc); 4064 jccb(Assembler::equal, done); 4065 4066 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4067 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4068 4069 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4070 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4071 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4072 4073 // Recompute the mask for remaining special value. 4074 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4075 // Extract SRC values corresponding to TRUE mask lanes. 4076 vpand(xtmp4, xtmp2, src, vec_enc); 4077 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4078 // values are set. 
4079 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4080 4081 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4082 bind(done); 4083 } 4084 4085 void C2_MacroAssembler::vector_cast_float_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4086 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4087 Register scratch, AddressLiteral float_sign_flip, 4088 int vec_enc) { 4089 Label done; 4090 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, scratch); 4091 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4092 kortestwl(ktmp1, ktmp1); 4093 jccb(Assembler::equal, done); 4094 4095 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4096 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4097 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4098 4099 kxorwl(ktmp1, ktmp1, ktmp2); 4100 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4101 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4102 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4103 bind(done); 4104 } 4105 4106 /* 4107 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4108 * If src is NaN, the result is 0. 4109 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4110 * the result is equal to the value of Long.MIN_VALUE. 4111 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 4112 * the result is equal to the value of Long.MAX_VALUE. 4113 */ 4114 void C2_MacroAssembler::vector_cast_double_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4115 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4116 Register scratch, AddressLiteral double_sign_flip, 4117 int vec_enc) { 4118 Label done; 4119 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, scratch); 4120 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 4121 kortestwl(ktmp1, ktmp1); 4122 jccb(Assembler::equal, done); 4123 4124 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4125 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4126 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4127 4128 kxorwl(ktmp1, ktmp1, ktmp2); 4129 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4130 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4131 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4132 bind(done); 4133 } 4134 4135 /* 4136 * Algorithm for vector D2L and F2I conversions:- 4137 * a) Perform vector D2L/F2I cast. 4138 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 4139 * It signifies that source value could be any of the special floating point 4140 * values(NaN,-Inf,Inf,Max,-Min). 4141 * c) Set destination to zero if source is NaN value. 4142 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 
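 *
 * Scalar sketch of the F2I fix-up described above (illustration only; the vector
 * code below performs the same steps lane-wise under a mask):
 *   int r = cvtt(f);            // raw cvttps2dq result, 0x80000000 for special values
 *   if (r == 0x80000000) {      // slow path: NaN, +/-Inf or out-of-range input
 *     if (f != f)     r = 0;           // NaN -> 0
 *     else if (f > 0) r = 0x7FFFFFFF;  // +ve -> Integer.MAX_VALUE
 *     // otherwise keep 0x80000000 == Integer.MIN_VALUE
 *   }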
4143 */ 4144 4145 void C2_MacroAssembler::vector_castD2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4146 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 4147 Register scratch, int vec_enc) { 4148 evcvttpd2qq(dst, src, vec_enc); 4149 vector_cast_double_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, scratch, double_sign_flip, vec_enc); 4150 } 4151 4152 void C2_MacroAssembler::vector_castF2I_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4153 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4154 AddressLiteral float_sign_flip, Register scratch, int vec_enc) { 4155 vcvttps2dq(dst, src, vec_enc); 4156 vector_cast_float_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, scratch, float_sign_flip, vec_enc); 4157 } 4158 4159 void C2_MacroAssembler::vector_castF2I_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4160 KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 4161 Register scratch, int vec_enc) { 4162 vcvttps2dq(dst, src, vec_enc); 4163 vector_cast_float_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, scratch, float_sign_flip, vec_enc); 4164 } 4165 4166 #ifdef _LP64 4167 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4168 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 4169 AddressLiteral new_mxcsr, Register scratch, int vec_enc) { 4170 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4171 // and re-instantiate original MXCSR.RC mode after that. 4172 ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std()); 4173 ldmxcsr(new_mxcsr, scratch); 4174 mov64(scratch, julong_cast(0.5L)); 4175 evpbroadcastq(xtmp1, scratch, vec_enc); 4176 vaddpd(xtmp1, src , xtmp1, vec_enc); 4177 evcvtpd2qq(dst, xtmp1, vec_enc); 4178 vector_cast_double_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, scratch, double_sign_flip, vec_enc); 4179 ldmxcsr(mxcsr_std, scratch); 4180 } 4181 4182 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4183 KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 4184 AddressLiteral new_mxcsr, Register scratch, int vec_enc) { 4185 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4186 // and re-instantiate original MXCSR.RC mode after that. 4187 ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std()); 4188 ldmxcsr(new_mxcsr, scratch); 4189 movl(scratch, jint_cast(0.5)); 4190 movq(xtmp1, scratch); 4191 vbroadcastss(xtmp1, xtmp1, vec_enc); 4192 vaddps(xtmp1, src , xtmp1, vec_enc); 4193 vcvtps2dq(dst, xtmp1, vec_enc); 4194 vector_cast_float_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, scratch, float_sign_flip, vec_enc); 4195 ldmxcsr(mxcsr_std, scratch); 4196 } 4197 4198 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4199 XMMRegister xtmp3, XMMRegister xtmp4, AddressLiteral float_sign_flip, 4200 AddressLiteral new_mxcsr, Register scratch, int vec_enc) { 4201 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4202 // and re-instantiate original MXCSR.RC mode after that. 
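  // Scalar sketch of the lane-wise computation below, assuming new_mxcsr selects the
  // round-towards-negative-infinity rounding mode:
  //   round(f) ~= (int) floor(f + 0.5f)
  // vcvtps2dq honors MXCSR.RC, so with RC == round-down the convert acts as a floor().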
4203 ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std()); 4204 ldmxcsr(new_mxcsr, scratch); 4205 movl(scratch, jint_cast(0.5)); 4206 movq(xtmp1, scratch); 4207 vbroadcastss(xtmp1, xtmp1, vec_enc); 4208 vaddps(xtmp1, src , xtmp1, vec_enc); 4209 vcvtps2dq(dst, xtmp1, vec_enc); 4210 vector_cast_float_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, scratch, float_sign_flip, vec_enc); 4211 ldmxcsr(mxcsr_std, scratch); 4212 } 4213 #endif 4214 4215 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 4216 BasicType from_elem_bt, BasicType to_elem_bt) { 4217 switch (from_elem_bt) { 4218 case T_BYTE: 4219 switch (to_elem_bt) { 4220 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 4221 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 4222 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 4223 default: ShouldNotReachHere(); 4224 } 4225 break; 4226 case T_SHORT: 4227 switch (to_elem_bt) { 4228 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 4229 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 4230 default: ShouldNotReachHere(); 4231 } 4232 break; 4233 case T_INT: 4234 assert(to_elem_bt == T_LONG, ""); 4235 vpmovzxdq(dst, src, vlen_enc); 4236 break; 4237 default: 4238 ShouldNotReachHere(); 4239 } 4240 } 4241 4242 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 4243 bool merge, BasicType bt, int vlen_enc) { 4244 if (bt == T_INT) { 4245 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 4246 } else { 4247 assert(bt == T_LONG, ""); 4248 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 4249 } 4250 } 4251 4252 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 4253 bool merge, BasicType bt, int vlen_enc) { 4254 if (bt == T_INT) { 4255 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 4256 } else { 4257 assert(bt == T_LONG, ""); 4258 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 4259 } 4260 } 4261 4262 #ifdef _LP64 4263 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 4264 Register rtmp2, XMMRegister xtmp, int mask_len, 4265 int vec_enc) { 4266 int index = 0; 4267 int vindex = 0; 4268 mov64(rtmp1, 0x0101010101010101L); 4269 pdep(rtmp1, src, rtmp1); 4270 if (mask_len > 8) { 4271 movq(rtmp2, src); 4272 vpxor(xtmp, xtmp, xtmp, vec_enc); 4273 movq(xtmp, rtmp1); 4274 } 4275 movq(dst, rtmp1); 4276 4277 mask_len -= 8; 4278 while (mask_len > 0) { 4279 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 4280 index++; 4281 if ((index % 2) == 0) { 4282 pxor(xtmp, xtmp); 4283 } 4284 mov64(rtmp1, 0x0101010101010101L); 4285 shrq(rtmp2, 8); 4286 pdep(rtmp1, rtmp2, rtmp1); 4287 pinsrq(xtmp, rtmp1, index % 2); 4288 vindex = index / 2; 4289 if (vindex) { 4290 // Write entire 16 byte vector when both 64 bit 4291 // lanes are update to save redundant instructions. 
4292 if (index % 2) { 4293 vinsertf128(dst, dst, xtmp, vindex); 4294 } 4295 } else { 4296 vmovdqu(dst, xtmp); 4297 } 4298 mask_len -= 8; 4299 } 4300 } 4301 4302 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 4303 switch(opc) { 4304 case Op_VectorMaskTrueCount: 4305 popcntq(dst, tmp); 4306 break; 4307 case Op_VectorMaskLastTrue: 4308 if (VM_Version::supports_lzcnt()) { 4309 lzcntq(tmp, tmp); 4310 movl(dst, 63); 4311 subl(dst, tmp); 4312 } else { 4313 movl(dst, -1); 4314 bsrq(tmp, tmp); 4315 cmov32(Assembler::notZero, dst, tmp); 4316 } 4317 break; 4318 case Op_VectorMaskFirstTrue: 4319 if (VM_Version::supports_bmi1()) { 4320 if (masklen < 32) { 4321 orl(tmp, 1 << masklen); 4322 tzcntl(dst, tmp); 4323 } else if (masklen == 32) { 4324 tzcntl(dst, tmp); 4325 } else { 4326 assert(masklen == 64, ""); 4327 tzcntq(dst, tmp); 4328 } 4329 } else { 4330 if (masklen < 32) { 4331 orl(tmp, 1 << masklen); 4332 bsfl(dst, tmp); 4333 } else { 4334 assert(masklen == 32 || masklen == 64, ""); 4335 movl(dst, masklen); 4336 if (masklen == 32) { 4337 bsfl(tmp, tmp); 4338 } else { 4339 bsfq(tmp, tmp); 4340 } 4341 cmov32(Assembler::notZero, dst, tmp); 4342 } 4343 } 4344 break; 4345 case Op_VectorMaskToLong: 4346 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 4347 break; 4348 default: assert(false, "Unhandled mask operation"); 4349 } 4350 } 4351 4352 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 4353 int masklen, int masksize, int vec_enc) { 4354 assert(VM_Version::supports_popcnt(), ""); 4355 4356 if(VM_Version::supports_avx512bw()) { 4357 kmovql(tmp, mask); 4358 } else { 4359 assert(masklen <= 16, ""); 4360 kmovwl(tmp, mask); 4361 } 4362 4363 // Mask generated out of partial vector comparisons/replicate/mask manipulation 4364 // operations needs to be clipped. 4365 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 4366 andq(tmp, (1 << masklen) - 1); 4367 } 4368 4369 vector_mask_operation_helper(opc, dst, tmp, masklen); 4370 } 4371 4372 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 4373 Register tmp, int masklen, BasicType bt, int vec_enc) { 4374 assert(vec_enc == AVX_128bit && VM_Version::supports_avx() || 4375 vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), ""); 4376 assert(VM_Version::supports_popcnt(), ""); 4377 4378 bool need_clip = false; 4379 switch(bt) { 4380 case T_BOOLEAN: 4381 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 4382 vpxor(xtmp, xtmp, xtmp, vec_enc); 4383 vpsubb(xtmp, xtmp, mask, vec_enc); 4384 vpmovmskb(tmp, xtmp, vec_enc); 4385 need_clip = masklen < 16; 4386 break; 4387 case T_BYTE: 4388 vpmovmskb(tmp, mask, vec_enc); 4389 need_clip = masklen < 16; 4390 break; 4391 case T_SHORT: 4392 vpacksswb(xtmp, mask, mask, vec_enc); 4393 if (masklen >= 16) { 4394 vpermpd(xtmp, xtmp, 8, vec_enc); 4395 } 4396 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 4397 need_clip = masklen < 16; 4398 break; 4399 case T_INT: 4400 case T_FLOAT: 4401 vmovmskps(tmp, mask, vec_enc); 4402 need_clip = masklen < 4; 4403 break; 4404 case T_LONG: 4405 case T_DOUBLE: 4406 vmovmskpd(tmp, mask, vec_enc); 4407 need_clip = masklen < 2; 4408 break; 4409 default: assert(false, "Unhandled type, %s", type2name(bt)); 4410 } 4411 4412 // Mask generated out of partial vector comparisons/replicate/mask manipulation 4413 // operations needs to be clipped. 
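  // For example, a T_INT mask with masklen == 2 coming from vmovmskps on a 128-bit
  // vector leaves bits 2 and 3 of tmp set by lanes outside the logical vector;
  // and-ing with (1 << masklen) - 1 == 0x3 keeps only the two valid mask bits.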
4414 if (need_clip && opc != Op_VectorMaskFirstTrue) { 4415 // need_clip implies masklen < 32 4416 andq(tmp, (1 << masklen) - 1); 4417 } 4418 4419 vector_mask_operation_helper(opc, dst, tmp, masklen); 4420 } 4421 4422 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 4423 Register rtmp2, int mask_len) { 4424 kmov(rtmp1, src); 4425 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 4426 mov64(rtmp2, -1L); 4427 pext(rtmp2, rtmp2, rtmp1); 4428 kmov(dst, rtmp2); 4429 } 4430 4431 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 4432 bool merge, BasicType bt, int vec_enc) { 4433 if (opcode == Op_CompressV) { 4434 switch(bt) { 4435 case T_BYTE: 4436 evpcompressb(dst, mask, src, merge, vec_enc); 4437 break; 4438 case T_CHAR: 4439 case T_SHORT: 4440 evpcompressw(dst, mask, src, merge, vec_enc); 4441 break; 4442 case T_INT: 4443 evpcompressd(dst, mask, src, merge, vec_enc); 4444 break; 4445 case T_FLOAT: 4446 evcompressps(dst, mask, src, merge, vec_enc); 4447 break; 4448 case T_LONG: 4449 evpcompressq(dst, mask, src, merge, vec_enc); 4450 break; 4451 case T_DOUBLE: 4452 evcompresspd(dst, mask, src, merge, vec_enc); 4453 break; 4454 default: 4455 fatal("Unsupported type"); 4456 break; 4457 } 4458 } else { 4459 assert(opcode == Op_ExpandV, ""); 4460 switch(bt) { 4461 case T_BYTE: 4462 evpexpandb(dst, mask, src, merge, vec_enc); 4463 break; 4464 case T_CHAR: 4465 case T_SHORT: 4466 evpexpandw(dst, mask, src, merge, vec_enc); 4467 break; 4468 case T_INT: 4469 evpexpandd(dst, mask, src, merge, vec_enc); 4470 break; 4471 case T_FLOAT: 4472 evexpandps(dst, mask, src, merge, vec_enc); 4473 break; 4474 case T_LONG: 4475 evpexpandq(dst, mask, src, merge, vec_enc); 4476 break; 4477 case T_DOUBLE: 4478 evexpandpd(dst, mask, src, merge, vec_enc); 4479 break; 4480 default: 4481 fatal("Unsupported type"); 4482 break; 4483 } 4484 } 4485 } 4486 #endif 4487 4488 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 4489 if (VM_Version::supports_avx512bw()) { 4490 if (mask_len > 32) { 4491 kmovql(dst, src); 4492 } else { 4493 kmovdl(dst, src); 4494 if (mask_len != 32) { 4495 kshiftrdl(dst, dst, 32 - mask_len); 4496 } 4497 } 4498 } else { 4499 assert(mask_len <= 16, ""); 4500 kmovwl(dst, src); 4501 if (mask_len != 16) { 4502 kshiftrwl(dst, dst, 16 - mask_len); 4503 } 4504 } 4505 } 4506 4507 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 4508 int lane_size = type2aelembytes(bt); 4509 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); 4510 if ((is_LP64 || lane_size < 8) && 4511 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 4512 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) { 4513 movptr(rtmp, imm32); 4514 switch(lane_size) { 4515 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 4516 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 4517 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 4518 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 4519 default : ShouldNotReachHere(); break; 4520 } 4521 } else { 4522 movptr(rtmp, imm32); 4523 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp)); 4524 switch(lane_size) { 4525 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 4526 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 4527 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 4528 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 4529 default : ShouldNotReachHere(); break; 4530 } 
  }
}

//
// The following is the lookup-table based popcount computation algorithm:
//            Index   Bit set count
//          [ 0000 ->   0,
//            0001 ->   1,
//            0010 ->   1,
//            0011 ->   2,
//            0100 ->   1,
//            0101 ->   2,
//            0110 ->   2,
//            0111 ->   3,
//            1000 ->   1,
//            1001 ->   2,
//            1010 ->   2,
//            1011 ->   3,
//            1100 ->   2,
//            1101 ->   3,
//            1110 ->   3,
//            1111 ->   4 ]
// a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// b. Right shift each byte of the vector lane by 4 positions.
// c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// d. Add the bitset count of upper and lower 4 bits of each byte.
// e. Unpack double words to quad words and compute sum of absolute difference of bitset
//    count of all the bytes of a quadword.
// f. Perform step e. for upper 128bit vector lane.
// g. Pack the bitset count of quadwords back to double word.
// h. Unpacking and packing operations are not needed for 64bit vector lane.

void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
  vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
  vpsrlw(dst, src, 4, vec_enc);
  vpand(dst, dst, xtmp1, vec_enc);
  vpand(xtmp1, src, xtmp1, vec_enc);
  vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), rtmp, vec_enc);
  vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
  vpshufb(dst, xtmp2, dst, vec_enc);
  vpaddb(dst, dst, xtmp1, vec_enc);
}

void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Following code is as per steps e, f, g and h of the above algorithm.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
  vpsadbw(dst, dst, xtmp2, vec_enc);
  vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
  vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
  vpackuswb(dst, xtmp1, dst, vec_enc);
}

void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcount of the upper and lower bytes of each word.
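  // For example, a 16-bit lane whose bytes already hold the per-byte popcounts
  // { 3, 2 } (i.e. 0x0302) becomes (0x0302 >> 8) + (0x0302 & 0x00FF) == 3 + 2 == 5.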
4593 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 4594 vpsrlw(dst, xtmp1, 8, vec_enc); 4595 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 4596 vpaddw(dst, dst, xtmp1, vec_enc); 4597 } 4598 4599 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4600 XMMRegister xtmp2, Register rtmp, int vec_enc) { 4601 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 4602 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4603 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 4604 } 4605 4606 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4607 XMMRegister xtmp2, Register rtmp, int vec_enc) { 4608 switch(bt) { 4609 case T_LONG: 4610 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 4611 break; 4612 case T_INT: 4613 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 4614 break; 4615 case T_CHAR: 4616 case T_SHORT: 4617 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 4618 break; 4619 case T_BYTE: 4620 case T_BOOLEAN: 4621 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 4622 break; 4623 default: 4624 ShouldNotReachHere(); 4625 } 4626 } 4627 4628 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 4629 KRegister mask, bool merge, int vec_enc) { 4630 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 4631 switch(bt) { 4632 case T_LONG: 4633 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 4634 evpopcntq(dst, mask, src, merge, vec_enc); 4635 break; 4636 case T_INT: 4637 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 4638 evpopcntd(dst, mask, src, merge, vec_enc); 4639 break; 4640 case T_CHAR: 4641 case T_SHORT: 4642 assert(VM_Version::supports_avx512_bitalg(), ""); 4643 evpopcntw(dst, mask, src, merge, vec_enc); 4644 break; 4645 case T_BYTE: 4646 case T_BOOLEAN: 4647 assert(VM_Version::supports_avx512_bitalg(), ""); 4648 evpopcntb(dst, mask, src, merge, vec_enc); 4649 break; 4650 default: 4651 ShouldNotReachHere(); 4652 } 4653 } 4654 4655 #ifndef _LP64 4656 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 4657 assert(VM_Version::supports_avx512bw(), ""); 4658 kmovdl(tmp, src); 4659 kunpckdql(dst, tmp, tmp); 4660 } 4661 #endif 4662 4663 // Bit reversal algorithm first reverses the bits of each byte followed by 4664 // a byte level reversal for multi-byte primitive types (short/int/long). 4665 // Algorithm performs a lookup table access to get reverse bit sequence 4666 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 4667 // is obtained by swapping the reverse bit sequences of upper and lower 4668 // nibble of a byte. 4669 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4670 XMMRegister xtmp2, Register rtmp, int vec_enc) { 4671 if (VM_Version::supports_avx512vlbw()) { 4672 4673 // Get the reverse bit sequence of lower nibble of each byte. 4674 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), rtmp, vec_enc); 4675 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 4676 vpandq(dst, xtmp2, src, vec_enc); 4677 vpshufb(dst, xtmp1, dst, vec_enc); 4678 vpsllq(dst, dst, 4, vec_enc); 4679 4680 // Get the reverse bit sequence of upper nibble of each byte. 
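    // Worked example for one byte b == 0b11010010 (0xD2):
    //   lut[b & 0x0F] == lut[0x2] == 0b0100, shifted left by 4 -> 0b01000000
    //   lut[b >> 4]   == lut[0xD] == 0b1011
    //   OR-ing the two halves gives 0b01001011 == 0x4B, i.e. 0xD2 with its bits reversed.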

// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reverse bit sequence
// corresponding to a 4 bit value. Thus the reverse bit sequence of a byte
// is obtained by swapping the reverse bit sequences of its upper and lower
// nibbles.
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, Register rtmp, int vec_enc) {
  if (VM_Version::supports_avx512vlbw()) {

    // Get the reverse bit sequence of the lower nibble of each byte.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), rtmp, vec_enc);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
    vpandq(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of the upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // OR the left shifted reverse bit sequence of the lower nibble with the right
    // shifted reverse bit sequence of the upper nibble to obtain the reverse bit
    // sequence of each byte.
    vporq(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, rtmp, vec_enc);

  } else if (!VM_Version::supports_avx512vlbw() && vec_enc == Assembler::AVX_512bit) {

    // Shift based bit reversal.
    assert(bt == T_LONG || bt == T_INT, "");
    vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);

    // Swap the lower and upper nibble of each byte.
    vpandq(dst, xtmp1, src, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);
    vpandn(xtmp2, xtmp1, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vporq(xtmp1, dst, xtmp2, vec_enc);

    // Swap the two least and two most significant bits of each nibble.
    vbroadcast(T_INT, xtmp2, 0x33333333, rtmp, vec_enc);
    vpandq(dst, xtmp2, xtmp1, vec_enc);
    vpsllq(dst, dst, 2, vec_enc);
    vpandn(xtmp2, xtmp2, xtmp1, vec_enc);
    vpsrlq(xtmp2, xtmp2, 2, vec_enc);
    vporq(xtmp1, dst, xtmp2, vec_enc);

    // Swap the bits within each adjacent pair of bits.
    vbroadcast(T_INT, xtmp2, 0x55555555, rtmp, vec_enc);
    vpandq(dst, xtmp2, xtmp1, vec_enc);
    vpsllq(dst, dst, 1, vec_enc);
    vpandn(xtmp2, xtmp2, xtmp1, vec_enc);
    vpsrlq(xtmp2, xtmp2, 1, vec_enc);
    vporq(xtmp1, dst, xtmp2, vec_enc);

    vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);

  } else {
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), rtmp, vec_enc);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);

    // Get the reverse bit sequence of the lower nibble of each byte.
    vpand(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of the upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // OR the left shifted reverse bit sequence of the lower nibble with the right
    // shifted reverse bit sequence of the upper nibble to obtain the reverse bit
    // sequence of each byte.
    vpor(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, rtmp, vec_enc);
  }
}

void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src,
                                                XMMRegister xtmp, AddressLiteral mask, Register rtmp, int vec_enc) {
  // Galois field instruction based bit reversal, based on the following algorithm:
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
  assert(VM_Version::supports_gfni(), "");
  vpbroadcastq(xtmp, mask, vec_enc, rtmp);
  vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
  vector_reverse_byte(bt, dst, xtmp, rtmp, vec_enc);
}
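
// For illustration only: a scalar sketch (hypothetical, not VM code) of the
// nibble lookup used by vector_reverse_bit above to reverse the bits of a byte.
//
//   static inline uint8_t reverse_bits_byte(uint8_t b) {
//     // rev_lut[i] is the 4 bit value i with its bits reversed.
//     static const uint8_t rev_lut[16] = { 0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
//                                          0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF };
//     return (uint8_t)((rev_lut[b & 0x0F] << 4) | rev_lut[b >> 4]);
//   }
//
// A byte level reversal (vector_reverse_byte/vector_reverse_byte64) then turns
// per-byte bit reversal into full element bit reversal for short/int/long.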

void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Shift/rotate based byte reversal.
  assert(VM_Version::supports_evex(), "");
  evmovdqul(xtmp1, k0, src, true, vec_enc);
  switch(bt) {
    case T_LONG:
      // Swap the upper and lower double word of each quad word.
      evprorq(xtmp1, k0, xtmp1, 32, true, vec_enc);
      // fall-through
    case T_INT:
      // Swap the upper and lower word of each double word.
      evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
      // fall-through
    case T_SHORT:
      // Swap the upper and lower byte of each word.
      vbroadcast(T_INT, dst, 0x00FF00FF, rtmp, vec_enc);
      vpandq(xtmp2, dst, xtmp1, vec_enc);
      vpsllq(xtmp2, xtmp2, 8, vec_enc);
      vpandn(xtmp1, dst, xtmp1, vec_enc);
      vpsrlq(dst, xtmp1, 8, vec_enc);
      vporq(dst, dst, xtmp2, vec_enc);
      break;
    case T_BYTE:
      evmovdquq(dst, k0, src, true, vec_enc);
      break;
    default:
      fatal("Unsupported type");
      break;
  }
}

void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, Register rtmp, int vec_enc) {
  if (bt == T_BYTE) {
    if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
      evmovdquq(dst, k0, src, true, vec_enc);
    } else {
      vmovdqu(dst, src);
    }
    return;
  }
  // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
  // pre-computed shuffle indices.
  switch(bt) {
    case T_LONG:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), rtmp, vec_enc);
      break;
    case T_INT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), rtmp, vec_enc);
      break;
    case T_SHORT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), rtmp, vec_enc);
      break;
    default:
      fatal("Unsupported type");
      break;
  }
  vpshufb(dst, src, dst, vec_enc);
}
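
// For illustration only: a scalar sketch (hypothetical, not VM code) of the
// rotate/shift byte swap that vector_reverse_byte64 above applies to each
// T_INT lane (T_LONG additionally swaps the two double words first).
//
//   static inline uint32_t reverse_bytes_int(uint32_t x) {
//     x = (x << 16) | (x >> 16);                                   // swap the two words
//     return ((x & 0x00FF00FFu) << 8) | ((x >> 8) & 0x00FF00FFu);  // swap bytes within each word
//   }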

void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                        KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
  assert(is_integral_type(bt), "");
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  assert(VM_Version::supports_avx512cd(), "");
  switch(bt) {
    case T_LONG:
      evplzcntq(dst, ktmp, src, merge, vec_enc);
      break;
    case T_INT:
      evplzcntd(dst, ktmp, src, merge, vec_enc);
      break;
    case T_SHORT:
      vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
      vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
      evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
      vpunpckhwd(dst, xtmp1, src, vec_enc);
      evplzcntd(dst, ktmp, dst, merge, vec_enc);
      vpackusdw(dst, xtmp2, dst, vec_enc);
      break;
    case T_BYTE:
      // T1 = Compute leading zero counts of the 4 LSB bits of each byte by
      //      accessing the lookup table.
      // T2 = Compute leading zero counts of the 4 MSB bits of each byte by
      //      accessing the lookup table.
      // Add T1 to T2 if the 4 MSB bits of the byte are all zeros.
      assert(VM_Version::supports_avx512bw(), "");
      evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
      vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
      vpand(xtmp2, dst, src, vec_enc);
      vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
      vpsrlw(xtmp3, src, 4, vec_enc);
      vpand(xtmp3, dst, xtmp3, vec_enc);
      vpshufb(dst, xtmp1, xtmp3, vec_enc);
      vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
      evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
      evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
      break;
    default:
      ShouldNotReachHere();
  }
}

void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of the 4 LSB bits of each byte by
  //      accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of the 4 MSB bits of each byte by
  //      accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if the 4 MSB bits of the byte are all zeros.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  vpaddb(dst, dst, xtmp2, vec_enc);
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add the zero counts of the lower and upper bytes of a word if the
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlw(dst, dst, 8, vec_enc);
}
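
// For illustration only: a scalar sketch (hypothetical, not VM code) of the
// per-byte leading zero count computed by the T_BYTE paths above.
//
//   static inline uint8_t clz_byte_lut(uint8_t b) {
//     // Leading zero count of a 4 bit value: 0 -> 4, 1 -> 3, 2..3 -> 2, 4..7 -> 1, 8..15 -> 0.
//     static const uint8_t lut[16] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
//     uint8_t hi = b >> 4;
//     uint8_t lo = b & 0x0F;
//     return (hi == 0) ? (uint8_t)(lut[hi] + lut[lo]) : lut[hi];
//   }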

void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in 1.x form,
  // the biased exponent of (float)src locates the most significant set bit, and the
  // leading zero count follows as:
  //   LZCNT = 31 - (biased_exp - 127)
  // Special handling is applied for zero, MAX_INT and negative source values.

  // Broadcast 0xFF
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
  vpsrld(xtmp1, xtmp1, 24, vec_enc);

  // Extract the biased exponent.
  vcvtdq2ps(dst, src, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, dst, xtmp1, vec_enc);

  // Broadcast 127.
  vpsrld(xtmp1, xtmp1, 1, vec_enc);
  // Exponent = biased_exp - 127
  vpsubd(dst, dst, xtmp1, vec_enc);

  // Exponent = Exponent + 1
  vpsrld(xtmp3, xtmp1, 6, vec_enc);
  vpaddd(dst, dst, xtmp3, vec_enc);

  // Replace a negative exponent with zero; the exponent is negative when the
  // source lane contains a zero value.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vblendvps(dst, dst, xtmp2, dst, vec_enc);

  // Rematerialize broadcast 32.
  vpslld(xtmp1, xtmp3, 5, vec_enc);
  // The exponent is 32 if the corresponding source lane contains the MAX_INT value.
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  // LZCNT = 32 - exponent
  vpsubd(dst, xtmp1, dst, vec_enc);

  // Replace LZCNT with the value 1 if the corresponding source lane
  // contains the MAX_INT value.
  vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);

  // Replace LZCNT with 0 if the source lane value is negative.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vblendvps(dst, dst, xtmp2, src, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add the zero counts of the lower and upper words of a double word if the
  // upper word holds a zero value.
  vpsrld(xtmp3, src, 16, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
  vpslld(xtmp2, dst, 16, vec_enc);
  vpaddd(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrld(dst, dst, 16, vec_enc);
  // Add the zero counts of the lower and upper double words of a
  // quad word if the upper double word holds a zero value.
  vpsrlq(xtmp3, src, 32, vec_enc);
  vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
  vpsllq(xtmp2, dst, 32, vec_enc);
  vpaddq(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlq(dst, dst, 32, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
                                                       XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                       Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "unexpected type");
  assert(vec_enc < Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_INT:
      vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
      break;
    case T_SHORT:
      vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_BYTE:
      vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    default:
      ShouldNotReachHere();
  }
}
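
// For illustration only: a scalar sketch (hypothetical, not VM code) of the
// widening step used by vector_count_leading_zeros_short_avx/long_avx above,
// which combines the counts of the two halves of a wider element.
//
//   static inline uint32_t clz32_from_halves(uint32_t x, uint32_t clz_hi, uint32_t clz_lo) {
//     // If the upper half of x is zero, clz_hi is already 16, so the lower
//     // half's count is added on top; otherwise clz_hi alone is the answer.
//     return ((x >> 16) == 0) ? (clz_hi + clz_lo) : clz_hi;
//   }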

void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
  switch(bt) {
    case T_BYTE:
      vpsubb(dst, src1, src2, vec_enc);
      break;
    case T_SHORT:
      vpsubw(dst, src1, src2, vec_enc);
      break;
    case T_INT:
      vpsubd(dst, src1, src2, vec_enc);
      break;
    case T_LONG:
      vpsubq(dst, src1, src2, vec_enc);
      break;
    default:
      ShouldNotReachHere();
  }
}

void C2_MacroAssembler::vpadd(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
  switch(bt) {
    case T_BYTE:
      vpaddb(dst, src1, src2, vec_enc);
      break;
    case T_SHORT:
      vpaddw(dst, src1, src2, vec_enc);
      break;
    case T_INT:
      vpaddd(dst, src1, src2, vec_enc);
      break;
    case T_LONG:
      vpaddq(dst, src1, src2, vec_enc);
      break;
    default:
      ShouldNotReachHere();
  }
}

// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//   CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}

// Trailing zero count computation for AVX2 targets is based on the popcount operation
// as per the following equation:
//   CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}

void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3; the same approach is used by
  // java.lang.Long.divideUnsigned().
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}

void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3; the same approach is used by
  // java.lang.Long.remainderUnsigned().
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}
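
// For illustration only: a scalar C sketch (hypothetical, not VM code) of the
// divisor < 0 fastpath used by the unsigned division helpers above and below.
// When the divisor has its sign bit set, the unsigned quotient can only be 0 or 1
// (it is 1 exactly when dividend >= divisor as unsigned values).
//
//   static inline uint32_t udiv_neg_divisor(uint32_t dividend, uint32_t divisor) {
//     return (dividend & ~(dividend - divisor)) >> 31;            // quotient, 0 or 1
//   }
//   static inline uint32_t umod_neg_divisor(uint32_t dividend, uint32_t divisor) {
//     uint32_t mask = (uint32_t)((int32_t)(dividend & ~(dividend - divisor)) >> 31);
//     return dividend - (mask & divisor);                         // remainder
//   }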

void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3; the same approach is used by
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned().
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}

#ifdef _LP64
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3; the same approach is used by
  // java.lang.Long.divideUnsigned().
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3; the same approach is used by
  // java.lang.Long.remainderUnsigned().
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3; the same approach is used by
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned().
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
#endif
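
// For illustration only (hypothetical values, not VM code), a worked example of
// the divisor < 0 fastpath in udivL/umodL/udivmodL above:
//
//   dividend = 0xF000000000000000, divisor = 0x8000000000000001
//   dividend - divisor               = 0x6FFFFFFFFFFFFFFF   (sign bit clear)
//   dividend & ~(dividend - divisor) has the sign bit set, so
//   quotient  = (dividend & ~(dividend - divisor)) >>> 63  = 1
//   remainder = dividend - (divisor & -quotient)           = 0x6FFFFFFFFFFFFFFF
//
// which matches 0xF000000000000000 / 0x8000000000000001 = 1 with remainder
// 0x6FFFFFFFFFFFFFFF under unsigned 64 bit arithmetic.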