/*
 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "../../share/runtime/globals.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/objectMonitorTable.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/synchronizer.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

// In PRODUCT builds BLOCK_COMMENT expands to nothing and STOP only emits the
// stop; otherwise both also record the text via block_comment().
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
//
// framesize       : frame size in bytes; must be a multiple of
//                   StackAlignmentInBytes and includes the return address word
//                   (that word is subtracted first thing below).
// stack_bang_size : when positive (after removing the return address word) a
//                   stack overflow check is emitted and rbp is saved with a push;
//                   otherwise the frame is allocated first and rbp is stored
//                   into its slot with a move.
// fp_mode_24b     : not referenced by this implementation.
// is_stub         : when false, the nmethod entry barrier is emitted at the end
//                   of the prolog.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // No stack bang: allocate the whole frame in one step, then store rbp
    // into the topmost slot of the new frame.
    subptr(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      // rbp = rsp + framesize, i.e. the address of the slot rbp was just saved in.
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Magic cookie to verify stack depth
    // Stored in the slot directly below the saved rbp.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    // Check stack alignment. rax is pushed to free a scratch register, so the
    // expected low bits at the time of the compare are
    // StackAlignmentInBytes - wordSize.
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
}

// Maps a vector length in bytes to the assembler's AVX vector length encoding.
// Lengths of 4, 8 and 16 bytes all use the 128-bit encoding; any length other
// than 4, 8, 16, 32 or 64 is a programming error (ShouldNotReachHere).
inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path.  If the fast path fails then we pass
//    control to the slow path, typically in C.  In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock.  In the case of success, control
//    will drop through the node.  ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel

// Emits the inlined fast path for monitor enter.
//
// obj   : object to lock
// box   : on-stack box address -- KILLED
// rax   : tmp -- KILLED (must be rax; it is the implicit operand of the CAS)
// t     : tmp -- KILLED
// thread: base register for the JavaThread fields accessed below
//         (lock stack, om cache, monitor owner id)
//
// On exit ZF == 1 means the lock was acquired on the fast path; ZF == 0 means
// the caller must take the slow path.
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Objects of value-based classes always go to the slow path.
    // testb leaves ZF == 0 when the flag bit is set, as slow_path requires.
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Fast Lock

    Label push;

    // With the monitor table, box must survive until the inflated path caches
    // the monitor in it, so rax is used for top instead.
    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    // rax holds the expected (unlocked) mark, mark holds the new (locked) mark.
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmp(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      // The mark word itself is the (tagged) monitor pointer.
      assert(mark == monitor, "should be the same here");
    } else {
      const Register hash = t;
      Label monitor_found;

      // Look for the monitor in the om_cache.

      // Unrolled scan of the per-thread cache. The candidate monitor is loaded
      // before the compare so it is already in place when the oop matches.
      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled  = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        movptr(monitor, Address(thread, cache_offset + monitor_offset));
        cmpptr(obj, Address(thread, cache_offset));
        jccb(Assembler::equal, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      if (UseCompactObjectHeaders) {
        // TODO: The fast-path table lookup currently doesn't work with Lilliput's
        // compact identity-hashcode implementation.
        // See: https://bugs.openjdk.org/browse/JDK-8380981
        // ZF == 0 here: the last cache compare above did not match.
        jmp(slow_path);
      } else {
        // Look for the monitor in the table.

        // Get the hash code.
        movptr(hash, Address(obj, oopDesc::mark_offset_in_bytes()));
        shrq(hash, markWord::hash_shift);
        andq(hash, markWord::hash_mask);

        // Get the table and calculate the bucket's address.
        lea(rax_reg, ExternalAddress(ObjectMonitorTable::current_table_address()));
        movptr(rax_reg, Address(rax_reg));
        andq(hash, Address(rax_reg, ObjectMonitorTable::table_capacity_mask_offset()));
        movptr(rax_reg, Address(rax_reg, ObjectMonitorTable::table_buckets_offset()));

        // Read the monitor from the bucket.
        movptr(monitor, Address(rax_reg, hash, Address::times_ptr));

        // Check if the monitor in the bucket is special (empty, tombstone or removed)
        cmpptr(monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
        jcc(Assembler::below, slow_path);

        // Check if object matches.
        movptr(rax_reg, Address(monitor, ObjectMonitor::object_offset()));
        BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
        bs_asm->try_resolve_weak_handle_in_c2(this, rax_reg, slow_path);
        cmpptr(rax_reg, obj);
        jcc(Assembler::notEqual, slow_path);
      }
      bind(monitor_found);
    }
    // Without the monitor table the pointer in 'monitor' still carries the
    // monitor tag bits, so they are folded into the field offsets.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    // On CAS failure rax holds the current owner; compare it with our id.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

// obj: object to unlock
// rax: tmp -- KILLED
// t  : tmp - cannot be obj nor rax -- KILLED
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired in the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface
// Specification" states that an object locked by JNI's MonitorEnter should not be
// unlocked by "normal" java-level locking and vice-versa.  The specification doesn't
// specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide m->Owner==Self check in unlock.
434 // A perfectly viable alternative is to elide the owner check except when 435 // Xcheck:jni is enabled. 436 437 void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) { 438 assert(reg_rax == rax, "Used for CAS"); 439 assert_different_registers(obj, reg_rax, t); 440 441 // Handle inflated monitor. 442 Label inflated, inflated_check_lock_stack; 443 // Finish fast unlock successfully. MUST jump with ZF == 1 444 Label unlocked, slow_path; 445 446 const Register mark = t; 447 const Register monitor = t; 448 const Register top = UseObjectMonitorTable ? t : reg_rax; 449 const Register box = reg_rax; 450 451 Label dummy; 452 C2FastUnlockStub* stub = nullptr; 453 454 if (!Compile::current()->output()->in_scratch_emit_size()) { 455 stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread); 456 Compile::current()->output()->add_stub(stub); 457 } 458 459 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path(); 460 461 { // Fast Unlock 462 463 // Load top. 464 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 465 466 if (!UseObjectMonitorTable) { 467 // Prefetch mark. 468 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 469 } 470 471 // Check if obj is top of lock-stack. 472 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 473 // Top of lock stack was not obj. Must be monitor. 474 jcc(Assembler::notEqual, inflated_check_lock_stack); 475 476 // Pop lock-stack. 477 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);) 478 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 479 480 // Check if recursive. 481 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize)); 482 jcc(Assembler::equal, unlocked); 483 484 // We elide the monitor check, let the CAS fail instead. 485 486 if (UseObjectMonitorTable) { 487 // Load mark. 
488 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 489 } 490 491 // Try to unlock. Transition lock bits 0b00 => 0b01 492 movptr(reg_rax, mark); 493 andptr(reg_rax, ~(int32_t)markWord::lock_mask); 494 orptr(mark, markWord::unlocked_value); 495 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 496 jcc(Assembler::notEqual, push_and_slow_path); 497 jmp(unlocked); 498 } 499 500 501 { // Handle inflated monitor. 502 bind(inflated_check_lock_stack); 503 #ifdef ASSERT 504 Label check_done; 505 subl(top, oopSize); 506 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset())); 507 jcc(Assembler::below, check_done); 508 cmpptr(obj, Address(thread, top)); 509 jcc(Assembler::notEqual, inflated_check_lock_stack); 510 stop("Fast Unlock lock on stack"); 511 bind(check_done); 512 if (UseObjectMonitorTable) { 513 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 514 } 515 testptr(mark, markWord::monitor_value); 516 jcc(Assembler::notZero, inflated); 517 stop("Fast Unlock not monitor"); 518 #endif 519 520 bind(inflated); 521 522 if (!UseObjectMonitorTable) { 523 assert(mark == monitor, "should be the same here"); 524 } else { 525 // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack. 526 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 527 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*) 528 cmpptr(monitor, alignof(ObjectMonitor*)); 529 jcc(Assembler::below, slow_path); 530 } 531 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 
0 : checked_cast<int>(markWord::monitor_value)); 532 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag}; 533 const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag}; 534 const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag}; 535 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag}; 536 537 Label recursive; 538 539 // Check if recursive. 540 cmpptr(recursions_address, 0); 541 jcc(Assembler::notZero, recursive); 542 543 // Set owner to null. 544 // Release to satisfy the JMM 545 movptr(owner_address, NULL_WORD); 546 // We need a full fence after clearing owner to avoid stranding. 547 // StoreLoad achieves this. 548 membar(StoreLoad); 549 550 // Check if the entry_list is empty. 551 cmpptr(entry_list_address, NULL_WORD); 552 jcc(Assembler::zero, unlocked); // If so we are done. 553 554 // Check if there is a successor. 555 cmpptr(succ_address, NULL_WORD); 556 jcc(Assembler::notZero, unlocked); // If so we are done. 557 558 // Save the monitor pointer in the current thread, so we can try to 559 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 560 if (!UseObjectMonitorTable) { 561 andptr(monitor, ~(int32_t)markWord::monitor_value); 562 } 563 movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor); 564 565 orl(t, 1); // Fast Unlock ZF = 0 566 jmpb(slow_path); 567 568 // Recursive unlock. 569 bind(recursive); 570 decrement(recursions_address); 571 } 572 573 bind(unlocked); 574 xorl(t, t); // Fast Unlock ZF = 1 575 576 #ifdef ASSERT 577 // Check that unlocked label is reached with ZF set. 578 Label zf_correct; 579 Label zf_bad_zero; 580 jcc(Assembler::zero, zf_correct); 581 jmp(zf_bad_zero); 582 #endif 583 584 bind(slow_path); 585 if (stub != nullptr) { 586 bind(stub->slow_path_continuation()); 587 } 588 #ifdef ASSERT 589 // Check that stub->continuation() label is reached with ZF not set. 
590 jcc(Assembler::notZero, zf_correct); 591 stop("Fast Unlock ZF != 0"); 592 bind(zf_bad_zero); 593 stop("Fast Unlock ZF != 1"); 594 bind(zf_correct); 595 #endif 596 // C2 uses the value of ZF to determine the continuation. 597 } 598 599 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) { 600 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi); 601 } 602 603 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) { 604 const int framesize = Compile::current()->output()->frame_size_in_bytes(); 605 masm->movptr(dst, rsp); 606 if (framesize > 2 * wordSize) { 607 masm->addptr(dst, framesize - 2 * wordSize); 608 } 609 } 610 611 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) { 612 if (PreserveFramePointer) { 613 // frame pointer is valid 614 #ifdef ASSERT 615 // Verify frame pointer value in rbp. 616 reconstruct_frame_pointer_helper(this, rtmp); 617 Label L_success; 618 cmpq(rbp, rtmp); 619 jccb(Assembler::equal, L_success); 620 STOP("frame pointer mismatch"); 621 bind(L_success); 622 #endif // ASSERT 623 } else { 624 reconstruct_frame_pointer_helper(this, rbp); 625 } 626 } 627 628 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) { 629 jint lo = t->_lo; 630 jint hi = t->_hi; 631 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi); 632 if (t == TypeInt::INT) { 633 return; 634 } 635 636 BLOCK_COMMENT("CastII {"); 637 Label fail; 638 Label succeed; 639 640 if (lo != min_jint) { 641 cmpl(val, lo); 642 jccb(Assembler::less, fail); 643 } 644 if (hi != max_jint) { 645 cmpl(val, hi); 646 jccb(Assembler::greater, fail); 647 } 648 jmpb(succeed); 649 650 bind(fail); 651 movl(c_rarg0, idx); 652 movl(c_rarg1, val); 653 movl(c_rarg2, lo); 654 movl(c_rarg3, hi); 655 reconstruct_frame_pointer(rscratch1); 656 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range))); 657 hlt(); 658 bind(succeed); 659 
BLOCK_COMMENT("} // CastII"); 660 } 661 662 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) { 663 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi); 664 } 665 666 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) { 667 jlong lo = t->_lo; 668 jlong hi = t->_hi; 669 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi); 670 if (t == TypeLong::LONG) { 671 return; 672 } 673 674 BLOCK_COMMENT("CastLL {"); 675 Label fail; 676 Label succeed; 677 678 auto cmp_val = [&](jlong bound) { 679 if (is_simm32(bound)) { 680 cmpq(val, checked_cast<int>(bound)); 681 } else { 682 mov64(tmp, bound); 683 cmpq(val, tmp); 684 } 685 }; 686 687 if (lo != min_jlong) { 688 cmp_val(lo); 689 jccb(Assembler::less, fail); 690 } 691 if (hi != max_jlong) { 692 cmp_val(hi); 693 jccb(Assembler::greater, fail); 694 } 695 jmpb(succeed); 696 697 bind(fail); 698 movl(c_rarg0, idx); 699 movq(c_rarg1, val); 700 mov64(c_rarg2, lo); 701 mov64(c_rarg3, hi); 702 reconstruct_frame_pointer(rscratch1); 703 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range))); 704 hlt(); 705 bind(succeed); 706 BLOCK_COMMENT("} // CastLL"); 707 } 708 709 //------------------------------------------------------------------------------------------- 710 // Generic instructions support for use in .ad files C2 code generation 711 712 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 713 if (dst != src) { 714 movdqu(dst, src); 715 } 716 if (opcode == Op_AbsVD) { 717 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 718 } else { 719 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 720 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 721 } 722 } 723 724 void C2_MacroAssembler::vabsnegd(int opcode, 
XMMRegister dst, XMMRegister src, int vector_len) { 725 if (opcode == Op_AbsVD) { 726 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 727 } else { 728 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 729 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 730 } 731 } 732 733 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 734 if (dst != src) { 735 movdqu(dst, src); 736 } 737 if (opcode == Op_AbsVF) { 738 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 739 } else { 740 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 741 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 742 } 743 } 744 745 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 746 if (opcode == Op_AbsVF) { 747 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 748 } else { 749 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 750 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 751 } 752 } 753 754 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 755 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 756 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 757 758 if (opcode == Op_MinV) { 759 if (elem_bt == T_BYTE) { 760 pminsb(dst, src); 761 } else if (elem_bt == T_SHORT) { 762 pminsw(dst, src); 763 } else if (elem_bt == T_INT) { 764 pminsd(dst, src); 765 } else { 766 assert(elem_bt == T_LONG, "required"); 767 assert(tmp == xmm0, "required"); 768 assert_different_registers(dst, src, tmp); 769 movdqu(xmm0, dst); 770 pcmpgtq(xmm0, src); 771 blendvpd(dst, src); // xmm0 as mask 772 } 773 } else { // opcode == Op_MaxV 774 if (elem_bt == T_BYTE) { 775 pmaxsb(dst, src); 776 } 
else if (elem_bt == T_SHORT) { 777 pmaxsw(dst, src); 778 } else if (elem_bt == T_INT) { 779 pmaxsd(dst, src); 780 } else { 781 assert(elem_bt == T_LONG, "required"); 782 assert(tmp == xmm0, "required"); 783 assert_different_registers(dst, src, tmp); 784 movdqu(xmm0, src); 785 pcmpgtq(xmm0, dst); 786 blendvpd(dst, src); // xmm0 as mask 787 } 788 } 789 } 790 791 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 792 XMMRegister src1, Address src2, int vlen_enc) { 793 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 794 if (opcode == Op_UMinV) { 795 switch(elem_bt) { 796 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 797 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 798 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 799 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 800 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 801 } 802 } else { 803 assert(opcode == Op_UMaxV, "required"); 804 switch(elem_bt) { 805 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 806 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 807 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 808 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 809 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 810 } 811 } 812 } 813 814 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 815 // For optimality, leverage a full vector width of 512 bits 816 // for operations over smaller vector sizes on AVX512 targets. 
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}

// Emits the element-wise unsigned minimum (Op_UMinV) or maximum (Op_UMaxV) of src1 and
// src2 into dst, picking the instruction from elem_bt.
//   opcode   - Op_UMinV or Op_UMaxV (asserted).
//   elem_bt  - T_BYTE, T_SHORT, T_INT or T_LONG; anything else is a fatal error.
//   vlen_enc - vector length encoding handed through to the instruction.
// T_LONG goes through evpminuq/evpmaxuq with k0 and 'false' for the boolean argument
// (presumably the merge flag - confirm against the Assembler declaration).
void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

// Emits the element-wise minimum (Op_MinV) or maximum (Op_MaxV) of src1 and src2 into dst
// for byte, short, int and long elements.
// For T_LONG the direct vpminsq/vpmaxsq form is used only when UseAVX > 2 and either the
// vector is 512 bits wide or AVX512VL is supported. Otherwise the result is assembled from
// a vpcmpgtq mask plus vblendvpd; that fallback writes the mask into dst before reading the
// sources again, which is why dst must differ from src1 and src2 (asserted).
void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        // Mask (in dst) = src1 > src2
        vpcmpgtq(dst, src1, src2, vlen_enc);
        // Res = Mask ? src2 : src1
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        // Mask (in dst) = src1 > src2
        vpcmpgtq(dst, src1, src2, vlen_enc);
        // Res = Mask ? src1 : src2
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

// Emits a float/double vector min or max of a and b into dst (AVX, XMM temporaries only).
//   opcode   - Op_MinV / Op_MinReductionV select min, Op_MaxV / Op_MaxReductionV select max.
//   elem_bt  - T_FLOAT or T_DOUBLE (asserted).
//   tmp, atmp, btmp - temporaries; each must differ from a and from b (asserted). dst is
//                     not part of that assert; the dst == btmp case is handled explicitly.
// The sequence is: reorder the operands with two blends keyed on 'mask', run the hardware
// min/max, then patch in atmp wherever atmp is unordered with itself (see the note below).
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  // The float and double variants differ only in which emitters are called, so select them
  // once here. 'mask' is the register the operand-reordering blends are keyed on: a for min,
  // b for max.
  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  // 'maxmin' receives the raw hardware min/max, 'scratch' the unordered-compare mask; the
  // two roles are swapped between tmp and btmp when dst aliases btmp.
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  // With EnableX86ECoreOpts and UseAVX > 1 the blend mask is materialised into tmp up front
  // (vpsrad for floats, 0 > mask via vpcmpgtq for doubles) and the blends below are told so.
  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  // NOTE(review): the bool argument presumably tells vblend whether it still has to derive
  // the mask itself, and the last register is presumably its scratch - confirm against
  // MacroAssembler::vblendvps/vblendvpd.
  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  // scratch = lanes where atmp is unordered with itself; those lanes take atmp, the rest
  // take the min/max result.
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

// AVX-512 flavour of the float/double min/max above, using an opmask register instead of a
// vector temporary.
//   ktmp       - opmask temporary; first filled by evpmovd2m/evpmovq2m from a (min) or
//                b (max), later overwritten with the unordered-compare result.
//   atmp, btmp - vector temporaries; dst, a/b, atmp and btmp must be distinct (asserted).
// Each branch emits the same six-step shape: move-to-mask, two masked blends that reorder
// the operands, hardware min/max into dst, self-compare of atmp with UNORD_Q, and a masked
// move of atmp over dst.
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/double min/max through a single evminmaxps/evminmaxpd instruction.
// The immediate is AVX10_2_MINMAX_MIN_COMPARE_SIGN for the min opcodes and
// AVX10_2_MINMAX_MAX_COMPARE_SIGN for the max opcodes; 'mask' is passed as the opmask and
// 'true' as the boolean argument (presumably merge-masking - confirm against the Assembler
// declaration). Unlike the other overloads this one needs no temporaries.
void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opc == Op_MinV || opc == Op_MinReductionV ||
         opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");

  int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
                                                           : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
  if (elem_bt == T_FLOAT) {
    evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
  } else {
    assert(elem_bt == T_DOUBLE, "");
    evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
  }
}

// Float/Double signum
// dst holds the argument on entry and the result on exit. 'zero' is the register dst is
// compared against and 'one' the register copied into dst for the non-special cases
// (presumably holding 0.0 and 1.0 - verify against the caller).
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument
  // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases
  // If other floating point comparison instructions used, ZF=1 for equal and unordered cases
  if (opcode == Op_SignumF) {
    if (VM_Version::supports_avx10_2()) {
      vucomxss(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomiss(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // The 'above' branch below still consumes the flags of the compare above, so the
    // sequence relies on movflt leaving the flags untouched.
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    // Not above: flip the sign of the value just loaded from 'one'.
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    if (VM_Version::supports_avx10_2()) {
      vucomxsd(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomisd(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Same flag dependency as in the float case: movdbl must not disturb the flags.
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

// Widens bytes in src to words in dst: pmovsxbw when 'sign' is true, pmovzxbw otherwise.
void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

// Byte-to-word widening with an explicit vector length: vpmovsxbw when 'sign' is true,
// vpmovzxbw otherwise.
void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

// Byte-to-doubleword widening: vpmovsxbd when 'sign' is true, vpmovzxbd otherwise.
void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

// Word-to-doubleword widening: vpmovsxwd when 'sign' is true, vpmovzxwd otherwise.
void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

// Rotates every element of src by the immediate 'shift' into dst.
// Op_RotateLeftV selects evprold/evprolq; any other opcode must be Op_RotateRightV
// (asserted) and selects evprord/evprorq. Only T_INT and T_LONG elements are handled.
void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

// Same dispatch as vprotate_imm, but the rotate counts come from the vector register
// 'shift' (evprolvd/evprolvq/evprorvd/evprorvq).
void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

// In-place doubleword shift of dst by an immediate: psrad for Op_RShiftVI, pslld for
// Op_LShiftVI, psrld for Op_URShiftVI (the fall-through case, asserted).
void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

// In-place doubleword shift of dst with the count taken from the register 'shift'.
// An unexpected opcode trips the assert, which reports the node class name.
void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Three-operand doubleword shift by an immediate: dst receives nds shifted by 'shift'
// (vpsrad / vpslld / vpsrld).
void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

// Three-operand doubleword shift with the count in the register 'shift'.
void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// In-place word shift of dst with the count in the register 'shift'. The byte opcodes share
// the word instructions with their short counterparts.
void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift);   break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift);  break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Three-operand word shift with the count in the register 'shift'; byte opcodes again map
// onto the word instructions.
void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// In-place quadword shift of dst with the count in the register 'shift'.
// Note that Op_RShiftVL emits the same logical right shift as Op_URShiftVL here.
void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// In-place quadword shift of dst by an immediate; Op_RShiftVL again uses psrlq.
void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift); // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

// Three-operand quadword shift with the count in the register 'shift'. Op_RShiftVL uses
// evpsraq here, unlike the two-operand forms above.
void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Three-operand quadword shift by an immediate (evpsraq / vpsllq / vpsrlq).
void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

// Per-element variable doubleword shift: every lane of src is shifted by the matching lane
// of 'shift'. Byte and short opcodes are accepted as well and use the same doubleword
// instructions (callers such as varshiftbw widen their data first).
void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Per-element variable word shift (evpsravw / evpsllvw / evpsrlvw); byte opcodes share the
// word instructions.
void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Per-element variable quadword shift. Requires UseAVX >= 2.
//   tmp - only used by the Op_RShiftVL path when UseAVX <= 2; every other path asserts that
//         it is xnoreg.
// With UseAVX > 2 the arithmetic right shift is a single evpsravq; without AVX512VL it is
// issued at 512-bit width regardless of the requested vlen_enc.
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        // Build the arithmetic shift from logical shifts:
        //   t   = sign_mask >>> shift
        //   dst = ((src >>> shift) ^ t) - t
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Variable shift src by shift using vtmp as TEMP giving word result in dst
// Only vector_len == 0 is accepted (asserted). The bytes are widened to doublewords, shifted
// with varshiftd, masked with vector_int_to_byte_mask and packed back with vpackusdw.
// NOTE(review): the literal encodings 1 and 0 passed below are presumably AVX_256bit and
// AVX_128bit - confirm against the Assembler enum.
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  // Only the unsigned right shift zero-extends the data; the shift counts are always
  // zero-extended.
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

// Variable shift src by shift using vtmp as TEMP giving byte result in dst
// The data is widened to words at the next larger length encoding (vector_len + 1), shifted
// with varshiftw, masked with vector_short_to_byte_mask and packed back to bytes. For
// vector_len != 0 an extra vpermq with 0xD8 follows the pack.
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}

// Inserts the general-purpose register 'val' into element 'idx' of dst, using the pinsr*
// form that matches the element type (byte/short/int/long only).
void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst, val, idx);
      break;
    case T_SHORT:
      pinsrw(dst, val, idx);
      break;
    case T_INT:
      pinsrd(dst, val, idx);
      break;
    case T_LONG:
      pinsrq(dst, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// Three-operand variant of insert(): dst receives src with element 'idx' replaced by 'val'
// (vpinsr* forms, byte/short/int/long only).
void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      vpinsrb(dst, src, val, idx);
      break;
    case T_SHORT:
      vpinsrw(dst, src, val, idx);
      break;
    case T_INT:
      vpinsrd(dst, src, val, idx);
      break;
    case T_LONG:
      vpinsrq(dst, src, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// Gathers one 64-bit slice (4 shorts or 8 bytes) into the low part of dst under a bit mask.
//   base     - base address the gathered elements are loaded from.
//   idx_base - address of the 32-bit gather indices (read at idx_base + i * 4).
//   mask     - register whose bit number 'mask_idx' decides whether element i is loaded.
//   mask_idx - current bit position; incremented once per element, so it is advanced by
//              4 or 8 when this sequence has run.
//   rtmp     - scratch register for the loaded index.
// The C++ loops run at code-generation time, so the emitted code is fully unrolled. dst is
// zeroed first, which is what leaves skipped elements at 0.
void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
                                         Register base, Register idx_base,
                                         Register mask, Register mask_idx,
                                         Register rtmp, int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
      bind(skip_load);
      incq(mask_idx);
    }
  }
}

// Unmasked counterpart of vgather8b_masked: zeroes dst, then loads 4 shorts or 8 bytes from
// base at the 32-bit indices stored at idx_base, one pinsrw/pinsrb per element (unrolled at
// code-generation time). rtmp is scratch for the loaded index.
void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
                                  Register base, Register idx_base,
                                  Register rtmp, int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = src[idx_base[i]]
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = src[idx_base[i]]
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
    }
  }
}

/*
 * Gather using hybrid algorithm, first partially unroll scalar loop
 * to accumulate values from gather indices into a quad-word(64bit) slice.
 * A slice may hold 8 bytes or 4 short values. This is followed by a vector
 * permutation to place the slice into appropriate vector lane
 * locations in destination vector. Following pseudo code describes the
 * algorithm in detail:
 *
 * DST_VEC = ZERO_VEC
 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
 * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
 * FOREACH_ITER:
 *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
 *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
 *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
 *     PERM_INDEX = PERM_INDEX - TWO_VEC
 *
 * With each iteration, doubleword permute indices (0,1) corresponding
 * to gathered quadword gets right shifted by two lane positions.
 *
 */
// Register roles, as used below:
//   mask     - noreg selects the unmasked slice gather, otherwise the masked one.
//   xtmp1    - running permute indices; xtmp2 - constant vector of 2s; temp_dst - slice.
//   length   - loop counter, initialised from vector_len and decremented per slice.
//   idx_base - advanced past the consumed indices on every iteration (modified in place).
// NOTE(review): the loop exits only when 'length' reaches exactly zero, so vector_len is
// presumably a multiple of the per-slice element count - confirm against the callers.
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register mask, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister temp_dst,
                                        Register rtmp, Register mask_idx,
                                        Register length, int vector_len, int vlen_enc) {
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  movl(length, vector_len);
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
  vallones(xtmp2, vlen_enc);
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
  vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}

  bind(GATHER8_LOOP);
    // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
    if (mask == noreg) {
      vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
    } else {
      vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
    }
    // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
    vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
    // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
    vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
    // DST_VEC = DST_VEC OR TEMP_PERM_VEC
    vpor(dst, dst, temp_dst, vlen_enc);
    // Step past the indices just consumed: 32 bytes (8 indices) for byte elements,
    // 16 bytes (4 indices) for short elements; the counter drops by the same element count.
    addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
    subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
    jcc(Assembler::notEqual, GATHER8_LOOP);
}

// Emits a vector gather from base + idx * element size with a vector-register mask.
// Supports T_INT/T_FLOAT (scale times_4) and T_LONG/T_DOUBLE (scale times_8); the gather
// forms used all take doubleword indices.
void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
  switch(typ) {
    case T_INT:
      vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_FLOAT:
      vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_LONG:
      vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    case T_DOUBLE:
      vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// Same gather dispatch as vgather(), but with an opmask register and the EVEX forms.
void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
  switch(typ) {
    case T_INT:
      evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_FLOAT:
      evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_LONG:
      evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    case T_DOUBLE:
      evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// Emits a vector scatter of src to base + idx * element size under an opmask register.
// Supports T_INT/T_FLOAT (scale times_4) and T_LONG/T_DOUBLE (scale times_8).
void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
  switch(typ) {
    case T_INT:
      evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_FLOAT:
      evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_LONG:
      evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    case T_DOUBLE:
      evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// Expands a vector of byte flags in src into a vector mask in dst.
// Each byte is negated (0 - src), then sign-extended to the element width of elem_bt, so a
// source byte of 1 becomes an all-ones element (presumably src only holds 0/1 - verify
// against the caller). Vectors of up to 16 bytes use the SSE forms; larger ones use the AVX
// forms, with the byte subtract limited to 256 bits when is_legacy is set.
void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
  if (vlen_in_bytes <= 16) {
    pxor (dst, dst);
    psubb(dst, src);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);

    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */            break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}

// Expands a vector of byte flags in src into the opmask register dst, using xtmp as scratch.
// When novlbwdq is set the bytes are sign-extended to doublewords and compared for equality
// against the vector_int_mask_cmp_bits constant; otherwise the bytes are negated and
// converted with evpmovb2m.
void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
  if (novlbwdq) {
    vpmovsxbd(xtmp, src, vlen_enc);
    evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
            Assembler::eq, true, vlen_enc, noreg);
  } else {
    vpxor(xtmp, xtmp, xtmp, vlen_enc);
    vpsubb(xtmp, xtmp, src, vlen_enc);
    evpmovb2m(dst, xtmp, vlen_enc);
  }
}

// Loads vlen_in_bytes bytes (4, 8, 16, 32 or 64) from src into dst.
// Integral element types use the integer move family, all other types the floating-point
// move family; any other size hits ShouldNotReachHere().
void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
  if (is_integral_type(bt)) {
    switch (vlen_in_bytes) {
      case 4:  movdl(dst, src);   break;
      case 8:  movq(dst, src);    break;
      case 16: movdqu(dst, src);  break;
      case 32: vmovdqu(dst, src); break;
      case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
      default: ShouldNotReachHere();
    }
  } else {
    switch (vlen_in_bytes) {
      case 4:  movflt(dst, src); break;
      case 8:  movdbl(dst, src); break;
      case 16: movups(dst, src); break;
      case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
      case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
      default: ShouldNotReachHere();
    }
  }
}

// AddressLiteral overload: loads directly when src is reachable, otherwise materialises the
// address in rscratch first. rscratch may only be noreg when src is always reachable
// (asserted).
void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    load_vector(bt, dst, as_Address(src), vlen_in_bytes);
  } else {
    lea(rscratch, src);
    load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
  }
}

// Fills dst with the constant stored at src, for a vector of 'vlen' bytes.
// With AVX the value is broadcast: 64-bit forms for T_LONG/T_DOUBLE, 32-bit forms for
// everything else, preferring the integer broadcasts when AVX2 is available.
// Without AVX but with SSE3, movddup is used for every element type; with neither, the
// constant is read with a plain load_vector of 'vlen' bytes.
// NOTE(review): the movddup paths duplicate a 64-bit value, so the constant at src is
// presumably laid out as at least 8 bytes in those configurations - confirm with the caller.
void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
  int vlen_enc = vector_length_encoding(vlen);
  if (VM_Version::supports_avx()) {
    if (bt == T_LONG) {
      if (VM_Version::supports_avx2()) {
        vpbroadcastq(dst, src, vlen_enc);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else if (bt == T_DOUBLE) {
      if (vlen_enc != Assembler::AVX_128bit) {
        vbroadcastsd(dst, src, vlen_enc, noreg);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else {
      if (VM_Version::supports_avx2() && is_integral_type(bt)) {
        vpbroadcastd(dst, src, vlen_enc);
      } else {
        vbroadcastss(dst, src, vlen_enc);
      }
    }
  } else if (VM_Version::supports_sse3()) {
    movddup(dst, src);
  } else {
    load_vector(bt, dst, src, vlen);
  }
}

// Loads vlen_in_bytes bytes of the iota index table for element type bt into dst.
// The table offset is log2(element size) * 64, plus 128 for floating-point types, i.e.
// B = 0, S = 64, I = 128, L = 192, F = 256, D = 320. The load itself always goes through
// the T_BYTE (integral) path of load_vector, whatever bt is.
void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
  // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
  int offset = exact_log2(type2aelembytes(bt)) << 6;
  if (is_floating_point_type(bt)) {
    offset += 128;
  }
  ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
  load_vector(T_BYTE, dst, addr, vlen_in_bytes);
}

// Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1711 1712 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1713 int vector_len = Assembler::AVX_128bit; 1714 1715 switch (opcode) { 1716 case Op_AndReductionV: pand(dst, src); break; 1717 case Op_OrReductionV: por (dst, src); break; 1718 case Op_XorReductionV: pxor(dst, src); break; 1719 case Op_MinReductionV: 1720 switch (typ) { 1721 case T_BYTE: pminsb(dst, src); break; 1722 case T_SHORT: pminsw(dst, src); break; 1723 case T_INT: pminsd(dst, src); break; 1724 case T_LONG: assert(UseAVX > 2, "required"); 1725 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1726 default: assert(false, "wrong type"); 1727 } 1728 break; 1729 case Op_MaxReductionV: 1730 switch (typ) { 1731 case T_BYTE: pmaxsb(dst, src); break; 1732 case T_SHORT: pmaxsw(dst, src); break; 1733 case T_INT: pmaxsd(dst, src); break; 1734 case T_LONG: assert(UseAVX > 2, "required"); 1735 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1736 default: assert(false, "wrong type"); 1737 } 1738 break; 1739 case Op_UMinReductionV: 1740 switch (typ) { 1741 case T_BYTE: vpminub(dst, dst, src, Assembler::AVX_128bit); break; 1742 case T_SHORT: vpminuw(dst, dst, src, Assembler::AVX_128bit); break; 1743 case T_INT: vpminud(dst, dst, src, Assembler::AVX_128bit); break; 1744 case T_LONG: evpminuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break; 1745 default: assert(false, "wrong type"); 1746 } 1747 break; 1748 case Op_UMaxReductionV: 1749 switch (typ) { 1750 case T_BYTE: vpmaxub(dst, dst, src, Assembler::AVX_128bit); break; 1751 case T_SHORT: vpmaxuw(dst, dst, src, Assembler::AVX_128bit); break; 1752 case T_INT: vpmaxud(dst, dst, src, Assembler::AVX_128bit); break; 1753 case T_LONG: evpmaxuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break; 1754 default: assert(false, "wrong type"); 1755 } 1756 break; 1757 case Op_AddReductionVF: addss(dst, src); break; 1758 case Op_AddReductionVD: addsd(dst, src); break; 1759 case Op_AddReductionVI: 1760 
switch (typ) { 1761 case T_BYTE: paddb(dst, src); break; 1762 case T_SHORT: paddw(dst, src); break; 1763 case T_INT: paddd(dst, src); break; 1764 default: assert(false, "wrong type"); 1765 } 1766 break; 1767 case Op_AddReductionVL: paddq(dst, src); break; 1768 case Op_MulReductionVF: mulss(dst, src); break; 1769 case Op_MulReductionVD: mulsd(dst, src); break; 1770 case Op_MulReductionVI: 1771 switch (typ) { 1772 case T_SHORT: pmullw(dst, src); break; 1773 case T_INT: pmulld(dst, src); break; 1774 default: assert(false, "wrong type"); 1775 } 1776 break; 1777 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1778 evpmullq(dst, dst, src, vector_len); break; 1779 default: assert(false, "wrong opcode"); 1780 } 1781 } 1782 1783 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1784 switch (opcode) { 1785 case Op_AddReductionVF: addps(dst, src); break; 1786 case Op_AddReductionVD: addpd(dst, src); break; 1787 case Op_MulReductionVF: mulps(dst, src); break; 1788 case Op_MulReductionVD: mulpd(dst, src); break; 1789 default: assert(false, "%s", NodeClassNames[opcode]); 1790 } 1791 } 1792 1793 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1794 int vector_len = Assembler::AVX_256bit; 1795 1796 switch (opcode) { 1797 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1798 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1799 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1800 case Op_MinReductionV: 1801 switch (typ) { 1802 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1803 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1804 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1805 case T_LONG: assert(UseAVX > 2, "required"); 1806 vpminsq(dst, src1, src2, vector_len); break; 1807 default: assert(false, "wrong type"); 1808 } 1809 break; 1810 case 
Op_MaxReductionV: 1811 switch (typ) { 1812 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1813 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1814 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1815 case T_LONG: assert(UseAVX > 2, "required"); 1816 vpmaxsq(dst, src1, src2, vector_len); break; 1817 default: assert(false, "wrong type"); 1818 } 1819 break; 1820 case Op_UMinReductionV: 1821 switch (typ) { 1822 case T_BYTE: vpminub(dst, src1, src2, vector_len); break; 1823 case T_SHORT: vpminuw(dst, src1, src2, vector_len); break; 1824 case T_INT: vpminud(dst, src1, src2, vector_len); break; 1825 case T_LONG: evpminuq(dst, k0, src1, src2, true, vector_len); break; 1826 default: assert(false, "wrong type"); 1827 } 1828 break; 1829 case Op_UMaxReductionV: 1830 switch (typ) { 1831 case T_BYTE: vpmaxub(dst, src1, src2, vector_len); break; 1832 case T_SHORT: vpmaxuw(dst, src1, src2, vector_len); break; 1833 case T_INT: vpmaxud(dst, src1, src2, vector_len); break; 1834 case T_LONG: evpmaxuq(dst, k0, src1, src2, true, vector_len); break; 1835 default: assert(false, "wrong type"); 1836 } 1837 break; 1838 case Op_AddReductionVI: 1839 switch (typ) { 1840 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1841 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1842 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1843 default: assert(false, "wrong type"); 1844 } 1845 break; 1846 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1847 case Op_MulReductionVI: 1848 switch (typ) { 1849 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1850 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1851 default: assert(false, "wrong type"); 1852 } 1853 break; 1854 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1855 default: assert(false, "wrong opcode"); 1856 } 1857 } 1858 1859 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister 
src1, XMMRegister src2) { 1860 int vector_len = Assembler::AVX_256bit; 1861 1862 switch (opcode) { 1863 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1864 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1865 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1866 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1867 default: assert(false, "%s", NodeClassNames[opcode]); 1868 } 1869 } 1870 1871 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1872 XMMRegister dst, XMMRegister src, 1873 XMMRegister vtmp1, XMMRegister vtmp2) { 1874 switch (opcode) { 1875 case Op_AddReductionVF: 1876 case Op_MulReductionVF: 1877 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1878 break; 1879 1880 case Op_AddReductionVD: 1881 case Op_MulReductionVD: 1882 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1883 break; 1884 1885 default: assert(false, "wrong opcode"); 1886 } 1887 } 1888 1889 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 1890 XMMRegister dst, XMMRegister src, 1891 XMMRegister vtmp1, XMMRegister vtmp2) { 1892 switch (opcode) { 1893 case Op_AddReductionVF: 1894 case Op_MulReductionVF: 1895 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1896 break; 1897 1898 case Op_AddReductionVD: 1899 case Op_MulReductionVD: 1900 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1901 break; 1902 1903 default: assert(false, "%s", NodeClassNames[opcode]); 1904 } 1905 } 1906 1907 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1908 Register dst, Register src1, XMMRegister src2, 1909 XMMRegister vtmp1, XMMRegister vtmp2) { 1910 switch (vlen) { 1911 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1912 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1913 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1914 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1915 1916 default: assert(false, "wrong vector length"); 
1917 } 1918 } 1919 1920 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1921 Register dst, Register src1, XMMRegister src2, 1922 XMMRegister vtmp1, XMMRegister vtmp2) { 1923 switch (vlen) { 1924 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1925 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1926 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1927 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1928 1929 default: assert(false, "wrong vector length"); 1930 } 1931 } 1932 1933 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1934 Register dst, Register src1, XMMRegister src2, 1935 XMMRegister vtmp1, XMMRegister vtmp2) { 1936 switch (vlen) { 1937 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1938 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1939 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1940 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1941 1942 default: assert(false, "wrong vector length"); 1943 } 1944 } 1945 1946 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1947 Register dst, Register src1, XMMRegister src2, 1948 XMMRegister vtmp1, XMMRegister vtmp2) { 1949 switch (vlen) { 1950 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1951 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1952 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1953 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1954 1955 default: assert(false, "wrong vector length"); 1956 } 1957 } 1958 1959 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1960 Register dst, Register src1, XMMRegister src2, 1961 XMMRegister vtmp1, XMMRegister vtmp2) { 1962 switch (vlen) { 1963 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1964 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1965 case 8: reduce8L(opcode, dst, src1, 
src2, vtmp1, vtmp2); break; 1966 1967 default: assert(false, "wrong vector length"); 1968 } 1969 } 1970 1971 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1972 switch (vlen) { 1973 case 2: 1974 assert(vtmp2 == xnoreg, ""); 1975 reduce2F(opcode, dst, src, vtmp1); 1976 break; 1977 case 4: 1978 assert(vtmp2 == xnoreg, ""); 1979 reduce4F(opcode, dst, src, vtmp1); 1980 break; 1981 case 8: 1982 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1983 break; 1984 case 16: 1985 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1986 break; 1987 default: assert(false, "wrong vector length"); 1988 } 1989 } 1990 1991 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1992 switch (vlen) { 1993 case 2: 1994 assert(vtmp2 == xnoreg, ""); 1995 reduce2D(opcode, dst, src, vtmp1); 1996 break; 1997 case 4: 1998 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1999 break; 2000 case 8: 2001 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2002 break; 2003 default: assert(false, "wrong vector length"); 2004 } 2005 } 2006 2007 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2008 switch (vlen) { 2009 case 2: 2010 assert(vtmp1 == xnoreg, ""); 2011 assert(vtmp2 == xnoreg, ""); 2012 unorderedReduce2F(opcode, dst, src); 2013 break; 2014 case 4: 2015 assert(vtmp2 == xnoreg, ""); 2016 unorderedReduce4F(opcode, dst, src, vtmp1); 2017 break; 2018 case 8: 2019 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2020 break; 2021 case 16: 2022 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2023 break; 2024 default: assert(false, "wrong vector length"); 2025 } 2026 } 2027 2028 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2029 switch (vlen) { 2030 case 2: 2031 assert(vtmp1 == xnoreg, ""); 2032 
assert(vtmp2 == xnoreg, ""); 2033 unorderedReduce2D(opcode, dst, src); 2034 break; 2035 case 4: 2036 assert(vtmp2 == xnoreg, ""); 2037 unorderedReduce4D(opcode, dst, src, vtmp1); 2038 break; 2039 case 8: 2040 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2); 2041 break; 2042 default: assert(false, "wrong vector length"); 2043 } 2044 } 2045 2046 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2047 if (opcode == Op_AddReductionVI) { 2048 if (vtmp1 != src2) { 2049 movdqu(vtmp1, src2); 2050 } 2051 phaddd(vtmp1, vtmp1); 2052 } else { 2053 pshufd(vtmp1, src2, 0x1); 2054 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2055 } 2056 movdl(vtmp2, src1); 2057 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2058 movdl(dst, vtmp1); 2059 } 2060 2061 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2062 if (opcode == Op_AddReductionVI) { 2063 if (vtmp1 != src2) { 2064 movdqu(vtmp1, src2); 2065 } 2066 phaddd(vtmp1, src2); 2067 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2068 } else { 2069 pshufd(vtmp2, src2, 0xE); 2070 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2071 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2072 } 2073 } 2074 2075 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2076 if (opcode == Op_AddReductionVI) { 2077 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2078 vextracti128_high(vtmp2, vtmp1); 2079 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2080 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2081 } else { 2082 vextracti128_high(vtmp1, src2); 2083 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2084 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2085 } 2086 } 2087 2088 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister 
vtmp1, XMMRegister vtmp2) { 2089 vextracti64x4_high(vtmp2, src2); 2090 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2091 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2092 } 2093 2094 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2095 pshufd(vtmp2, src2, 0x1); 2096 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2097 movdqu(vtmp1, vtmp2); 2098 psrldq(vtmp1, 2); 2099 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2100 movdqu(vtmp2, vtmp1); 2101 psrldq(vtmp2, 1); 2102 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2103 movdl(vtmp2, src1); 2104 if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) { 2105 pmovzxbd(vtmp1, vtmp1); 2106 } else { 2107 pmovsxbd(vtmp1, vtmp1); 2108 } 2109 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2110 pextrb(dst, vtmp1, 0x0); 2111 movsbl(dst, dst); 2112 } 2113 2114 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2115 pshufd(vtmp1, src2, 0xE); 2116 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2117 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2118 } 2119 2120 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2121 vextracti128_high(vtmp2, src2); 2122 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2123 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2124 } 2125 2126 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2127 vextracti64x4_high(vtmp1, src2); 2128 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2129 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2130 } 2131 2132 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2133 pmovsxbw(vtmp2, src2); 2134 
reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2135 } 2136 2137 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2138 if (UseAVX > 1) { 2139 int vector_len = Assembler::AVX_256bit; 2140 vpmovsxbw(vtmp1, src2, vector_len); 2141 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2142 } else { 2143 pmovsxbw(vtmp2, src2); 2144 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2145 pshufd(vtmp2, src2, 0x1); 2146 pmovsxbw(vtmp2, src2); 2147 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2148 } 2149 } 2150 2151 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2152 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2153 int vector_len = Assembler::AVX_512bit; 2154 vpmovsxbw(vtmp1, src2, vector_len); 2155 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2156 } else { 2157 assert(UseAVX >= 2,"Should not reach here."); 2158 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2159 vextracti128_high(vtmp2, src2); 2160 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2161 } 2162 } 2163 2164 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2165 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2166 vextracti64x4_high(vtmp2, src2); 2167 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2168 } 2169 2170 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2171 if (opcode == Op_AddReductionVI) { 2172 if (vtmp1 != src2) { 2173 movdqu(vtmp1, src2); 2174 } 2175 phaddw(vtmp1, vtmp1); 2176 phaddw(vtmp1, vtmp1); 2177 } else { 2178 pshufd(vtmp2, src2, 0x1); 2179 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2180 movdqu(vtmp1, vtmp2); 2181 psrldq(vtmp1, 2); 2182 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2183 } 2184 
movdl(vtmp2, src1); 2185 if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) { 2186 pmovzxwd(vtmp1, vtmp1); 2187 } else { 2188 pmovsxwd(vtmp1, vtmp1); 2189 } 2190 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2191 pextrw(dst, vtmp1, 0x0); 2192 movswl(dst, dst); 2193 } 2194 2195 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2196 if (opcode == Op_AddReductionVI) { 2197 if (vtmp1 != src2) { 2198 movdqu(vtmp1, src2); 2199 } 2200 phaddw(vtmp1, src2); 2201 } else { 2202 pshufd(vtmp1, src2, 0xE); 2203 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2204 } 2205 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2206 } 2207 2208 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2209 if (opcode == Op_AddReductionVI) { 2210 int vector_len = Assembler::AVX_256bit; 2211 vphaddw(vtmp2, src2, src2, vector_len); 2212 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2213 } else { 2214 vextracti128_high(vtmp2, src2); 2215 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2216 } 2217 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2218 } 2219 2220 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2221 int vector_len = Assembler::AVX_256bit; 2222 vextracti64x4_high(vtmp1, src2); 2223 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2224 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2225 } 2226 2227 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2228 pshufd(vtmp2, src2, 0xE); 2229 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2230 movdq(vtmp1, src1); 2231 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2232 movdq(dst, vtmp1); 2233 } 2234 2235 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, 
XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2236 vextracti128_high(vtmp1, src2); 2237 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2238 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2239 } 2240 2241 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2242 vextracti64x4_high(vtmp2, src2); 2243 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2244 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2245 } 2246 2247 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2248 mov64(temp, -1L); 2249 bzhiq(temp, temp, len); 2250 kmovql(dst, temp); 2251 } 2252 2253 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2254 reduce_operation_128(T_FLOAT, opcode, dst, src); 2255 pshufd(vtmp, src, 0x1); 2256 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2257 } 2258 2259 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2260 reduce2F(opcode, dst, src, vtmp); 2261 pshufd(vtmp, src, 0x2); 2262 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2263 pshufd(vtmp, src, 0x3); 2264 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2265 } 2266 2267 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2268 reduce4F(opcode, dst, src, vtmp2); 2269 vextractf128_high(vtmp2, src); 2270 reduce4F(opcode, dst, vtmp2, vtmp1); 2271 } 2272 2273 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2274 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2275 vextracti64x4_high(vtmp1, src); 2276 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2277 } 2278 2279 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2280 pshufd(dst, src, 0x1); 2281 reduce_operation_128(T_FLOAT, opcode, dst, src); 2282 } 2283 2284 void 
C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2285 pshufd(vtmp, src, 0xE); 2286 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2287 unorderedReduce2F(opcode, dst, vtmp); 2288 } 2289 2290 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2291 vextractf128_high(vtmp1, src); 2292 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2293 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2294 } 2295 2296 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2297 vextractf64x4_high(vtmp2, src); 2298 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2299 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2300 } 2301 2302 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2303 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2304 pshufd(vtmp, src, 0xE); 2305 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2306 } 2307 2308 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2309 reduce2D(opcode, dst, src, vtmp2); 2310 vextractf128_high(vtmp2, src); 2311 reduce2D(opcode, dst, vtmp2, vtmp1); 2312 } 2313 2314 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2315 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2316 vextracti64x4_high(vtmp1, src); 2317 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2318 } 2319 2320 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2321 pshufd(dst, src, 0xE); 2322 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2323 } 2324 2325 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2326 vextractf128_high(vtmp, src); 2327 
unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2328 unorderedReduce2D(opcode, dst, vtmp); 2329 } 2330 2331 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2332 vextractf64x4_high(vtmp2, src); 2333 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2334 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2335 } 2336 2337 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2338 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2339 } 2340 2341 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2342 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2343 } 2344 2345 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2346 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2347 } 2348 2349 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2350 int vec_enc) { 2351 switch(elem_bt) { 2352 case T_INT: 2353 case T_FLOAT: 2354 vmaskmovps(dst, src, mask, vec_enc); 2355 break; 2356 case T_LONG: 2357 case T_DOUBLE: 2358 vmaskmovpd(dst, src, mask, vec_enc); 2359 break; 2360 default: 2361 fatal("Unsupported type %s", type2name(elem_bt)); 2362 break; 2363 } 2364 } 2365 2366 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2367 int vec_enc) { 2368 switch(elem_bt) { 2369 case T_INT: 2370 case T_FLOAT: 2371 vmaskmovps(dst, src, mask, vec_enc); 2372 break; 2373 case T_LONG: 2374 case T_DOUBLE: 2375 vmaskmovpd(dst, src, mask, vec_enc); 2376 break; 2377 default: 2378 fatal("Unsupported type %s", type2name(elem_bt)); 2379 break; 2380 } 2381 } 2382 2383 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int 
vlen, bool is_dst_valid, 2384 XMMRegister dst, XMMRegister src, 2385 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2386 XMMRegister xmm_0, XMMRegister xmm_1) { 2387 const int permconst[] = {1, 14}; 2388 XMMRegister wsrc = src; 2389 XMMRegister wdst = xmm_0; 2390 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2391 2392 int vlen_enc = Assembler::AVX_128bit; 2393 if (vlen == 16) { 2394 vlen_enc = Assembler::AVX_256bit; 2395 } 2396 2397 for (int i = log2(vlen) - 1; i >=0; i--) { 2398 if (i == 0 && !is_dst_valid) { 2399 wdst = dst; 2400 } 2401 if (i == 3) { 2402 vextracti64x4_high(wtmp, wsrc); 2403 } else if (i == 2) { 2404 vextracti128_high(wtmp, wsrc); 2405 } else { // i = [0,1] 2406 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2407 } 2408 2409 if (VM_Version::supports_avx10_2()) { 2410 vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc); 2411 } else { 2412 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2413 } 2414 wsrc = wdst; 2415 vlen_enc = Assembler::AVX_128bit; 2416 } 2417 if (is_dst_valid) { 2418 if (VM_Version::supports_avx10_2()) { 2419 vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit); 2420 } else { 2421 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2422 } 2423 } 2424 } 2425 2426 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2427 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2428 XMMRegister xmm_0, XMMRegister xmm_1) { 2429 XMMRegister wsrc = src; 2430 XMMRegister wdst = xmm_0; 2431 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2432 int vlen_enc = Assembler::AVX_128bit; 2433 if (vlen == 8) { 2434 vlen_enc = Assembler::AVX_256bit; 2435 } 2436 for (int i = log2(vlen) - 1; i >=0; i--) { 2437 if (i == 0 && !is_dst_valid) { 2438 wdst = dst; 2439 } 2440 if (i == 1) { 2441 vextracti128_high(wtmp, wsrc); 2442 } else if (i == 2) { 2443 vextracti64x4_high(wtmp, wsrc); 2444 } else { 2445 assert(i == 0, "%d", i); 2446 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2447 } 2448 2449 if (VM_Version::supports_avx10_2()) { 2450 vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc); 2451 } else { 2452 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2453 } 2454 2455 wsrc = wdst; 2456 vlen_enc = Assembler::AVX_128bit; 2457 } 2458 2459 if (is_dst_valid) { 2460 if (VM_Version::supports_avx10_2()) { 2461 vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit); 2462 } else { 2463 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2464 } 2465 } 2466 } 2467 2468 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2469 switch (bt) { 2470 case T_BYTE: pextrb(dst, src, idx); break; 2471 case T_SHORT: pextrw(dst, src, idx); break; 2472 case T_INT: pextrd(dst, src, idx); break; 2473 case T_LONG: pextrq(dst, src, idx); break; 2474 2475 default: 2476 assert(false,"Should not reach here."); 2477 break; 2478 } 2479 } 2480 2481 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2482 int esize = type2aelembytes(typ); 2483 int elem_per_lane = 16/esize; 2484 int lane = elemindex / elem_per_lane; 2485 int eindex = elemindex % elem_per_lane; 2486 2487 if (lane >= 2) { 2488 assert(UseAVX > 2, "required"); 2489 vextractf32x4(dst, src, lane & 3); 2490 return dst; 2491 } else if (lane > 0) { 2492 assert(UseAVX > 0, "required"); 2493 vextractf128(dst, src, lane); 2494 return dst; 2495 } else { 2496 return src; 2497 } 2498 } 2499 2500 void 
C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2501 if (typ == T_BYTE) { 2502 movsbl(dst, dst); 2503 } else if (typ == T_SHORT) { 2504 movswl(dst, dst); 2505 } 2506 } 2507 2508 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2509 int esize = type2aelembytes(typ); 2510 int elem_per_lane = 16/esize; 2511 int eindex = elemindex % elem_per_lane; 2512 assert(is_integral_type(typ),"required"); 2513 2514 if (eindex == 0) { 2515 if (typ == T_LONG) { 2516 movq(dst, src); 2517 } else { 2518 movdl(dst, src); 2519 movsxl(typ, dst); 2520 } 2521 } else { 2522 extract(typ, dst, src, eindex); 2523 movsxl(typ, dst); 2524 } 2525 } 2526 2527 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2528 int esize = type2aelembytes(typ); 2529 int elem_per_lane = 16/esize; 2530 int eindex = elemindex % elem_per_lane; 2531 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2532 2533 if (eindex == 0) { 2534 movq(dst, src); 2535 } else { 2536 if (typ == T_FLOAT) { 2537 if (UseAVX == 0) { 2538 movdqu(dst, src); 2539 shufps(dst, dst, eindex); 2540 } else { 2541 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2542 } 2543 } else { 2544 if (UseAVX == 0) { 2545 movdqu(dst, src); 2546 psrldq(dst, eindex*esize); 2547 } else { 2548 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2549 } 2550 movq(dst, dst); 2551 } 2552 } 2553 // Zero upper bits 2554 if (typ == T_FLOAT) { 2555 if (UseAVX == 0) { 2556 assert(vtmp != xnoreg, "required."); 2557 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2558 pand(dst, vtmp); 2559 } else { 2560 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2561 } 2562 } 2563 } 2564 2565 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2566 switch(typ) { 2567 
case T_BYTE: 2568 case T_BOOLEAN: 2569 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2570 break; 2571 case T_SHORT: 2572 case T_CHAR: 2573 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2574 break; 2575 case T_INT: 2576 case T_FLOAT: 2577 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2578 break; 2579 case T_LONG: 2580 case T_DOUBLE: 2581 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2582 break; 2583 default: 2584 assert(false,"Should not reach here."); 2585 break; 2586 } 2587 } 2588 2589 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2590 assert(rscratch != noreg || always_reachable(src2), "missing"); 2591 2592 switch(typ) { 2593 case T_BOOLEAN: 2594 case T_BYTE: 2595 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2596 break; 2597 case T_CHAR: 2598 case T_SHORT: 2599 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2600 break; 2601 case T_INT: 2602 case T_FLOAT: 2603 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2604 break; 2605 case T_LONG: 2606 case T_DOUBLE: 2607 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2608 break; 2609 default: 2610 assert(false,"Should not reach here."); 2611 break; 2612 } 2613 } 2614 2615 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2616 switch(typ) { 2617 case T_BYTE: 2618 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2619 break; 2620 case T_SHORT: 2621 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2622 break; 2623 case T_INT: 2624 case T_FLOAT: 2625 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 
2626 break; 2627 case T_LONG: 2628 case T_DOUBLE: 2629 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2630 break; 2631 default: 2632 assert(false,"Should not reach here."); 2633 break; 2634 } 2635 } 2636 2637 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2638 assert(vlen_in_bytes <= 32, ""); 2639 int esize = type2aelembytes(bt); 2640 if (vlen_in_bytes == 32) { 2641 assert(vtmp == xnoreg, "required."); 2642 if (esize >= 4) { 2643 vtestps(src1, src2, AVX_256bit); 2644 } else { 2645 vptest(src1, src2, AVX_256bit); 2646 } 2647 return; 2648 } 2649 if (vlen_in_bytes < 16) { 2650 // Duplicate the lower part to fill the whole register, 2651 // Don't need to do so for src2 2652 assert(vtmp != xnoreg, "required"); 2653 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2654 pshufd(vtmp, src1, shuffle_imm); 2655 } else { 2656 assert(vtmp == xnoreg, "required"); 2657 vtmp = src1; 2658 } 2659 if (esize >= 4 && VM_Version::supports_avx()) { 2660 vtestps(vtmp, src2, AVX_128bit); 2661 } else { 2662 ptest(vtmp, src2); 2663 } 2664 } 2665 2666 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2667 #ifdef ASSERT 2668 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2669 bool is_bw_supported = VM_Version::supports_avx512bw(); 2670 if (is_bw && !is_bw_supported) { 2671 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2672 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2673 "XMM register should be 0-15"); 2674 } 2675 #endif // ASSERT 2676 switch (elem_bt) { 2677 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2678 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2679 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2680 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2681 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2682 case T_DOUBLE: 
vaddpd(dst, src1, src2, vlen_enc); return;
    default: fatal("Unsupported type %s", type2name(elem_bt)); return;
  }
}

// Broadcasts the value held in general-purpose register src to every element
// of vector dst. Requires UseAVX >= 2.
// When UseAVX > 2 and the needed AVX-512 sub-features are present (BW for
// byte/short elements, VL for vectors narrower than 512 bits) the broadcast is
// done directly from the GPR. Otherwise the value is first moved into dst and
// then broadcast from there, which restricts dst to xmm0-15 and the vector to
// less than 512 bits.
void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
  assert(UseAVX >= 2, "required");
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_vl = vlen_enc != Assembler::AVX_512bit;
  if ((UseAVX > 2) &&
      (!is_bw || VM_Version::supports_avx512bw()) &&
      (!is_vl || VM_Version::supports_avx512vl())) {
    switch (elem_bt) {
      case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
      case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
      case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
      case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  } else {
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16),"XMM register should be 0-15");
    switch (elem_bt) {
      case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
      case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
      case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
      case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
      case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
      case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  }
}

// Converts a vector of bytes in src to a vector of to_elem_bt elements in
// dst. Bytes are sign-extended; floating-point targets go through an
// intermediate int vector. Unsupported target types are fatal.
void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  switch (to_elem_bt) {
    case T_SHORT:
      vpmovsxbw(dst, src, vlen_enc);
      break;
    case T_INT:
      vpmovsxbd(dst, src, vlen_enc);
      break;
    case T_FLOAT:
      vpmovsxbd(dst, src, vlen_enc);
      vcvtdq2ps(dst, dst, vlen_enc);
break;
    case T_LONG:
      vpmovsxbq(dst, src, vlen_enc);
      break;
    case T_DOUBLE: {
      // The intermediate int vector is half as wide as the double result.
      int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
      vpmovsxbd(dst, src, mid_vlen_enc);
      vcvtdq2pd(dst, dst, vlen_enc);
      break;
    }
    default:
      fatal("Unsupported type %s", type2name(to_elem_bt));
      break;
  }
}

//-------------------------------------------------------------------------------------------

// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
//
// str1/cnt1 describe the scanned string, str2 the substring and int_cnt2 its
// compile-time constant length (in elements). On exit result holds the element
// index of the first match, or -1 if there is none.
// The registers are bound by pcmpestri: cnt1 must be rdx, cnt2 rax and tmp rcx.
// ae selects the encoding pair (LL, UU or UL; LU is rejected).
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2, Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  // scale1 scales indices into the string, scale2 indices into the substring
  // (the substring is Latin-1 bytes in the UL case).
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    // Zero-extend Latin-1 bytes to 16-bit chars while loading.
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1); // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND); // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    // From here on cnt2 is kept negative: it counts up towards zero as the
    // remaining substring elements are matched.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      // Small enough that the scaled tail offsets fit in a displacement.
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(EXIT);

} // string_indexofC8

// Small strings are loaded through stack if they cross page boundary.
//
// General IndexOf: int_cnt2 is either a small constant substring length or -1
// when the length is only known at run time (then it is in cnt2).
// On exit result holds the element index of the first match, or -1.
// The registers are bound by pcmpestri: cnt1 must be rdx, cnt2 rax and tmp rcx.
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2, Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ?
Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) { // small (< 8 chars) constant substring
      if (int_cnt2 == (1>>scale2)) { // One byte
        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
        load_unsigned_byte(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
        // Not enough header space in 32-bit VM: 12+3 = 15.
        // Load 4 bytes starting one byte early, then shift that byte out.
        movl(result, Address(str2, -1));
        shrl(result, 8);
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
        movq(vec, Address(str2, 0)); // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
        // Load a full vector that ENDS at the end of the substring (reading
        // into the array header in front of it), then shift the substring
        // down to the low end of vec.
        if (ae == StrIntrinsicNode::UL) {
          int tail_off = int_cnt2-8;
          pmovzxbw(vec, Address(str2, tail_off));
          psrldq(vec, -2*tail_off);
        }
        else {
          int tail_off = int_cnt2*(1<<scale2);
          movdqu(vec, Address(str2, tail_off-16));
          psrldq(vec, 16-tail_off);
        }
      }
    } else { // not constant substring
      cmpl(cnt2, stride);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if str+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, ((int)os::vm_page_size()-1));
      cmpl(result, ((int)os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      // The copy loop indexes with cnt2 counting down from the length to 1,
      // so the offset removes one element and skips the cnt2 slot pushed below.
      int stk_offset = wordSize-(1<<scale2);
      push(cnt2);

      bind(COPY_SUBSTR);
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
        movb(Address(rsp, cnt2, scale2, stk_offset), result);
      } else if (ae == StrIntrinsicNode::UU) {
        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
        movw(Address(rsp, cnt2, scale2, stk_offset), result);
      }
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp); // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, stride);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, ((int)os::vm_page_size()-1));
    cmpl(result, ((int)os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -(1<<scale1);
    if (int_cnt2 < 0) { // not constant
      // cnt2 is borrowed as the copy counter; preserve it on the stack.
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    bind(COPY_STR);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
      movb(Address(rsp, cnt2, scale1, stk_offset), result);
    } else {
      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
      movw(Address(rsp, cnt2, scale1, stk_offset), result);
    }
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp); // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, 0));
      } else {
        movdqu(vec, Address(str2, 0));
      }
      push(cnt2); // substr count
      push(str2); // substr addr
      push(str1); // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp); // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2 < 0) { // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
      shrl(str1, 1);
    }
    addl(cnt1, str1);
    decrementl(cnt1); // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring

    addptr(result, (1<<scale1));
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, stride); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, scale1, -16));
  movl(cnt1, stride);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));
  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, stride-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, stride);
    jcc(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, stride);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, stride);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    if (ae == StrIntrinsicNode::UL) {
      // The Latin-1 substring advances by 8 bytes per 8 elements.
      addptr(str2, 8);
    } else {
      addptr(str2, 16);
    }
    subl(cnt1, stride);
    cmpl(cnt2, stride); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.

    if (ae == StrIntrinsicNode::UL) {
      lea(str2, Address(str2, cnt2, scale2, -8));
      lea(str1, Address(str1, cnt2, scale1, -16));
    } else {
      lea(str2, Address(str2, cnt2, scale2, -16));
      lea(str1, Address(str1, cnt2, scale1, -16));
    }
    subl(cnt1, cnt2);
    movl(cnt2, stride);
    addl(cnt1, stride);
    bind(CONT_SCAN_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    jmp(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    // str1 was used as a scan cursor; reload the saved string address.
    movptr(str1, Address(rsp, wordSize));
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof

// Finds the first occurrence of the 16-bit char ch in str1 (cnt1 chars).
// On exit result holds the char index of the match, or -1 if there is none.
// Scans 16 chars at a time when UseAVX >= 2, then 8 chars at a time, then
// char by char for the tail. cnt1, ch and tmp are overwritten.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8;

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0); //vector count (in chars)
    andl(cnt1,0x0000000F); //tail count (in chars)

    bind(SCAN_TO_16_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqw(vec3, vec3, vec1, 1);
    // vec2 is all-zero, so CF is clear iff vec3 has a set bit (a match).
    vptest(vec2, vec3);
jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);
    bind(SCAN_TO_8_CHAR_INIT);
    // Broadcast ch to all eight 16-bit lanes of vec1.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8); //vector count (in chars)
  andl(cnt1,0x00000007); //tail count (in chars)

  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Turn the compare result into a byte mask; the lowest set bit gives the
  // byte offset of the first match within the vector.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  // Convert the match address to a char index.
  subptr(result, str1);
  shrl(result, 1);

  bind(DONE_LABEL);
} // string_indexof_char

// Latin-1 (byte element) variant of string_indexof_char: finds the first
// occurrence of byte ch in str1 (cnt1 bytes).
// On exit result holds the byte index of the match, or -1 if there is none.
// Scans 32 bytes at a time when UseAVX >= 2, then 16 bytes at a time, then
// byte by byte for the tail. cnt1, ch and tmp are overwritten.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16;

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0); //vector count (in chars)
    andl(cnt1,0x0000001F); //tail count (in chars)

    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
    // vec2 is all-zero, so CF is clear iff vec3 has a set bit (a match).
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    // Broadcast the low byte of ch to all 16 lanes: a byte shuffle with an
    // all-zero control vector selects byte 0 everywhere.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
  if (UseAVX < 2) {
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0); //vector count (in bytes)
  andl(cnt1,0x0000000F); //tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...

  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Lowest set bit of the byte mask is the offset of the first match.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  // Elements are bytes, so the address difference already is the index.
  subptr(result, str1);

  bind(DONE_LABEL);
} // stringL_indexof_char

// Returns the size in bytes of one array element of the given type, for the
// element types supported by the arrays_hashcode intrinsic.
int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
  switch (eltype) {
  case T_BOOLEAN: return sizeof(jboolean);
  case T_BYTE: return sizeof(jbyte);
  case T_SHORT: return sizeof(jshort);
  case T_CHAR: return sizeof(jchar);
  case T_INT: return sizeof(jint);
  default:
    ShouldNotReachHere();
    return -1;
  }
}

// Loads one array element from src into the 32-bit register dst, zero- or
// sign-extending it according to eltype.
void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
  switch (eltype) {
  // T_BOOLEAN used as surrogate for unsigned byte
  case T_BOOLEAN: movzbl(dst, src); break;
  case T_BYTE: movsbl(dst, src); break;
  case T_SHORT: movswl(dst, src); break;
  case T_CHAR: movzwl(dst, src); break;
  case T_INT: movl(dst, src); break;
  default:
    ShouldNotReachHere();
  }
}

// Loads eight consecutive elements of type eltype from src into dst.
void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
  load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
}

// Same as above, for a source given as an AddressLiteral.
void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
  load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
}

void
C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
  // Widens the elements held in dst, in place, to a 256-bit vector of ints:
  // unsigned widening for T_BOOLEAN/T_CHAR, signed for T_BYTE/T_SHORT, and
  // nothing to do for T_INT.
  const int vlen = Assembler::AVX_256bit;
  switch (eltype) {
  case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break;
  case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break;
  case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
  case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
  case T_INT:
    // do nothing
    break;
  default:
    ShouldNotReachHere();
  }
}

// Emits code that folds the cnt1 elements at ary1 into the running hash in
// result, i.e. result = 31 * result + element for each element in order.
// result is both input (initial hash) and output.
// Arrays of 32 or more elements first go through a vectorized loop handling
// 32 elements per iteration (four 256-bit accumulators); the remainder is
// handled two elements at a time by a scalar loop, plus one final element.
// Requires UseAVX >= 2. ary1, cnt1, index, tmp2, tmp3 and all vector
// registers are overwritten.
void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
                                        Register index, Register tmp2, Register tmp3, XMMRegister vnext,
                                        XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
                                        XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
                                        XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
                                        BasicType eltype) {
  ShortBranchVerifier sbv(this);
  assert(UseAVX >= 2, "AVX2 intrinsics are required");
  assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
  assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);

  Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
        SHORT_UNROLLED_LOOP_EXIT,
        UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
        UNROLLED_VECTOR_LOOP_BEGIN,
        END;
  switch (eltype) {
  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
  case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break;
  case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break;
  case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break;
  case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break;
  default: BLOCK_COMMENT("arrays_hashcode {"); break;
  }

  // For "renaming" for readability of the code
  const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
                    vresult[] = { vresult0, vresult1, vresult2, vresult3 },
                    vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };

  const int elsize = arrays_hashcode_elsize(eltype);

  /*
    if (cnt1 >= 2) {
      if (cnt1 >= 32) {
        UNROLLED VECTOR LOOP
      }
      UNROLLED SCALAR LOOP
    }
    SINGLE SCALAR
   */

  cmpl(cnt1, 32);
  jcc(Assembler::less, SHORT_UNROLLED_BEGIN);

  // cnt1 >= 32 && generate_vectorized_loop
  xorl(index, index);

  // vresult = IntVector.zero(I256);
  for (int idx = 0; idx < 4; idx++) {
    vpxor(vresult[idx], vresult[idx]);
  }
  // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
  Register bound = tmp2;
  Register next = tmp3;
  lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
  movl(next, Address(tmp2, 0));
  movdl(vnext, next);
  vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);

  // index = 0;
  // bound = cnt1 & ~(32 - 1);
  movl(bound, cnt1);
  andl(bound, ~(32 - 1));
  // for (; index < bound; index += 32) {
  bind(UNROLLED_VECTOR_LOOP_BEGIN);
  // result *= next;
  imull(result, next);
  // loop fission to upfront the cost of fetching from memory, OOO execution
  // can then hopefully do a better job of prefetching
  for (int idx = 0; idx < 4; idx++) {
    arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
  }
  // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
  for (int idx = 0; idx < 4; idx++) {
    vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
    arrays_hashcode_elvcast(vtmp[idx], eltype);
    vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
  }
  // index += 32;
  addl(index, 32);
  // index < bound;
  cmpl(index, bound);
  jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
  // }

  // Advance past the vectorized part so the scalar code below sees only the
  // remaining tail.
  lea(ary1, Address(ary1, bound, Address::times(elsize)));
  subl(cnt1, bound);
  // release bound

  // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
  for (int idx = 0; idx < 4; idx++) {
    lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
    arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
    vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
  }
  // result += vresult.reduceLanes(ADD);
  for (int idx = 0; idx < 4; idx++) {
    reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
  }

  // } else if (cnt1 < 32) {

  bind(SHORT_UNROLLED_BEGIN);
  // int i = 1;
  movl(index, 1);
  cmpl(index, cnt1);
  jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);

  // for (; i < cnt1 ; i += 2) {
  bind(SHORT_UNROLLED_LOOP_BEGIN);
  // Two elements per iteration:
  //   result = 961 * result + 31 * ary1[i-1] + ary1[i]   (961 == 31 * 31)
  movl(tmp3, 961);
  imull(result, tmp3);
  arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  // tmp3 = 31 * tmp2, computed as (tmp2 << 5) - tmp2
  movl(tmp3, tmp2);
  shll(tmp3, 5);
  subl(tmp3, tmp2);
  addl(result, tmp3);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
  addl(result, tmp3);
  addl(index, 2);
  cmpl(index, cnt1);
  jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);

  // }
  // if (i >= cnt1) {
  bind(SHORT_UNROLLED_LOOP_EXIT);
  // Flags still come from the last cmpl(index, cnt1): index > cnt1 means no
  // element is left; index == cnt1 means exactly one is.
  jccb(Assembler::greater, END);
  // result = 31 * result + ary1[i-1]
  movl(tmp2, result);
  shll(result, 5);
  subl(result, tmp2);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  addl(result, tmp3);
  // }
  bind(END);

  BLOCK_COMMENT("} // arrays_hashcode");

} // arrays_hashcode

// helper function for string_compare
// Emits the loads for the next pair of elements to be compared: elem1 is read
// from str1 and elem2 from str2, both addressed by 'index'. The encoding pair
// 'ae' selects the width of each load:
//   LL        - unsigned byte from both strings, addressed with 'scale'
//   UU        - unsigned short from both strings, addressed with 'scale'
//   otherwise - unsigned byte from str1 (scale1), unsigned short from str2 (scale2)
void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
                                           Address::ScaleFactor scale, Address::ScaleFactor scale1,
                                           Address::ScaleFactor scale2, Register index, int ae) {
  if (ae == StrIntrinsicNode::LL) {
    load_unsigned_byte(elem1, Address(str1, index, scale, 0));
    load_unsigned_byte(elem2, Address(str2, index, scale, 0));
  } else if (ae == StrIntrinsicNode::UU) {
    load_unsigned_short(elem1, Address(str1, index, scale, 0));
    load_unsigned_short(elem2, Address(str2, index, scale, 0));
  } else {
    // Mixed encodings: str1 is read as bytes, str2 as shorts.
    load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
    load_unsigned_short(elem2, Address(str2, index, scale2, 0));
  }
}

// Compare strings, used for char[] and byte[].
//
// Emits code that compares str1 and str2 element by element. On exit 'result'
// holds either the difference of the first pair of mismatching elements or,
// when the strings agree over the shorter length, the difference of the lengths.
//
//   str1, str2 - addresses of the first elements; advanced by the emitted code
//   cnt1, cnt2 - lengths; both are clobbered (cnt2 becomes the minimum length,
//                cnt1 is reused as a scratch/index register)
//   result     - receives the comparison result
//   vec1       - vector scratch register
//   ae         - StrIntrinsicNode encoding pair (LL, UU, LU or UL)
//   mask       - opmask scratch register, only used by the AVX-512 loop
//
// The length difference is pushed on the stack up front. It is either popped
// into 'result' (LENGTH_DIFF_LABEL) or discarded (POP_LABEL) before DONE_LABEL.
// Three code shapes are generated depending on the CPU features: AVX2 + SSE4.2
// (optionally with a 64-byte AVX-512 loop), SSE4.2 only, or a plain scalar loop.
// The vector shapes assert result == rax, cnt2 == rdx and cnt1 == rcx because
// pcmpestri uses those registers implicitly.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  // Element step of the 64-byte AVX-512 loop: 0x40 bytes for LL, 0x20 otherwise.
  int stride2x2 = 0x40;
  Address::ScaleFactor scale = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    stride2x2 = 0x20;
  }

  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    // Mixed encodings: halve cnt2 before the lengths are compared.
    shrl(cnt2, 1);
  }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);
  cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  // A mismatch in the very first element already determines the result.
  subl(result, cnt1);
  jcc(Assembler::notZero, POP_LABEL);

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  // Only one element to compare and it matched: the result is the length difference.
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  // Check if the strings start at the same location and setup scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16;
    } else {
      scale = Address::times_2;
      stride = 8;
    }
  } else {
    scale1 = Address::times_1;
    scale2 = Address::times_2;
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only AVX3

    // pcmpestri mode: 11000 (string compare with negated result) + 01 (unsigned
    // shorts); bit 0 is cleared for LL to select unsigned bytes.
    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }

    // Setup to compare 16-chars (32-bytes) vectors,
    // start from first character again because it has aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8;  //stride << scale1;
      adr_stride2 = 16; //stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path : compare first 2 8-char vectors.
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    // Mismatch in the second vector: make the index relative to the first one.
    addl(cnt1, stride);

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    // From here on 'result' is a negative index counting up towards the string end.
    negptr(result);

    // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      }
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
      addptr(result, stride2x2);  // update since we already compared at this addr
      subl(cnt2, stride2x2);      // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    // vec1 is all zero iff the two 32-byte chunks are identical.
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    // Run one more AVX2 iteration over the last stride2 elements of the strings.
    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower) 16-bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 to 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    // Finish the remaining elements in the scalar loop (negative index in cnt2).
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    // Branches on the flags set by the andl above; nothing emitted in between.
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1- substring
    //     rax - negative string length (elements count)
    //     mem - scanned string
    //     rdx - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    // Compare the last 'stride' elements of the strings once more.
    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length.  Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

  if (VM_Version::supports_avx512vlbw()) {

    // Target of the miscompare branch in the 64-byte AVX-512 loop.
    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    // Invert the equality mask and locate its lowest set bit, i.e. the first
    // position that compared unequal.
    kmovql(cnt1, mask);
    notq(cnt1);
    bsfq(cnt2, cnt1);
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if(ae == StrIntrinsicNode::UL) {
    // The sign of the result is flipped for UL.
    negl(result);
  }

}

// Search for Non-ASCII character (Negative byte value) in a byte array,
// return the index of the first such character, otherwise the length
// of the array segment searched.
//   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
//   @IntrinsicCandidate
//   public static int countPositives(byte[] ba, int off, int len) {
//     for (int i = off; i < off + len; i++) {
//       if (ba[i] < 0) {
//         return i - off;
//       }
//     }
//     return len;
//   }
//
// Parameters of the emitter:
//   ary1         - address of the first byte to scan; advanced by the emitted code
//   len          - number of bytes to scan; clobbered
//   result       - receives the count; it starts out as a copy of len and is only
//                  lowered on the paths that found a byte with the sign bit set
//   tmp1         - general purpose scratch register
//   vec1, vec2   - vector scratch registers (cleared at DONE when UseAVX >= 2)
//   mask1, mask2 - opmask scratch registers, only used by the AVX-512 variant
void C2_MacroAssembler::count_positives(Register ary1, Register len,
  Register result, Register tmp1,
  XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy
  // len == 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    // vec2 = 0; "0 > byte" below is true exactly for negative bytes.
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len, 0xffffffc0); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    // Point ary1 past the vectorized part and index it with a negative offset.
    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);
    jcc(Assembler::zero, DONE);


    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
    {
      // len is no longer needed as a counter here, so it is reused to build the
      // mask with the low tmp1 bits set.
      Register tmp3_aliased = len;
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);
      notq(tmp3_aliased);
      kmovql(mask2, tmp3_aliased);
    }

    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);

    // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds low 6-bit from original len;
                     // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      // (re-reads the last 32 bytes of the range, overlapping data already checked)
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are zeros, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
      // Set up to look at the last 32 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 31);
      movl(len, 31);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 16-byte vectors
      testl(len, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, TAIL_START);

      andl(len, 0xfffffff0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      // Broadcast the sign-bit mask 0x80808080 to all four dwords of vec2.
      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 16);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000000f); // len is zero, any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000000f);   // tail count (in bytes)
      movdqu(vec1, Address(ary1, len, Address::times_1, -16));
      ptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up and look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 15);
      movl(len, 15);
      // Fallthru to tail compare
    }
  }

  bind(TAIL_START);
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TAIL_ADJUST);
  addptr(len, 4);
  jccb(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2-3 bytes), if any
  bind(COMPARE_CHAR);

  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, CHAR_ADJUST);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail  byte
  jccb(Assembler::zero, DONE);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  testl(tmp1, 0x00000080);
  jccb(Assembler::zero, DONE);
  // The final byte is negative: it does not count.
  subptr(result, 1);
  jmpb(DONE);

  bind(TAIL_ADJUST);
  // there are negative bits in the last 4 byte block.
  // Adjust result and check the next three bytes
  addptr(result, len);
  orl(result, 3);
  lea(ary1, Address(ary1, len, Address::times_1));
  jmpb(COMPARE_CHAR);

  bind(CHAR_ADJUST);
  // We are looking at a char + optional byte tail, and found that one
  // of the bytes in the char is negative. Adjust the result, check the
  // first byte and readjust if needed.
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
//
// Emits code that sets 'result' to 1 when the two ranges are equal and to 0
// otherwise.
//
//   is_array_equ - true: ary1/ary2 are array oops; identity, null and length
//                  checks are emitted, 'limit' is loaded from ary1's length field
//                  and both registers are advanced to the first element.
//                  false: ary1/ary2/limit are used as passed in.
//   ary1, ary2   - the two operands; clobbered
//   limit        - element count; clobbered (shifted to a byte count for char[]
//                  when is_array_equ is set)
//   result       - receives 1 (equal) or 0 (not equal)
//   chr          - general purpose scratch register
//   vec1, vec2   - vector scratch registers (cleared at DONE when UseAVX >= 2)
//   is_char      - selects T_CHAR rather than T_BYTE for the array base offset
//   mask         - opmask scratch register, only used by the AVX-512 loop
//   expand_ary2  - ary2 holds bytes that are zero-extended to 16 bits before
//                  being compared with the 16-bit elements of ary1; asserted to
//                  be used only with UseAVX == 2
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char,
                                      KRegister mask, bool expand_ary2) {
  // for expand_ary2, limit is the (smaller) size of the second array.
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
         "Expansion only implemented for AVX2");

  int length_offset  = arrayOopDesc::length_offset_in_bytes();
  int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  // With expand_ary2, ary1 advances two bytes for every byte of ary2, so ary1 is
  // indexed with times_2 and each 16-byte step covers only 8 bytes of ary2.
  Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
  int scaleIncr = expand_ary2 ? 8 : 16;

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1);      // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;

    // Compare 32-byte vectors
    if (expand_ary2) {
      andl(result, 0x0000000f);    //   tail count (in bytes)
      andl(limit, 0xfffffff0);     // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);
    } else {
      andl(result, 0x0000001f);    //   tail count (in bytes)
      andl(limit, 0xffffffe0);     // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL_16);
    }

    // Point both registers past the vectorized part; limit becomes a negative
    // offset that counts up to zero.
    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      // Fewer than 64 bytes in the vectorized part: use the 32-byte loop instead.
      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
      addptr(limit, 64);  // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instruction and just continue via non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64);   // it is safe, bc we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    vpxor(vec1, vec2);

    // vec1 is all zero iff the two chunks are identical.
    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr * 2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Compare the tail with one more vector that ends exactly at the end of the
    // range, overlapping data that was already compared.
    vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    }
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    jmp(TRUE_LABEL);

    bind(COMPARE_TAIL_16); // limit is zero
    movl(limit, result);

    // Compare 16-byte chunks
    andl(result, 0x0000000f);    //   tail count (in bytes)
    andl(limit, 0xfffffff0);     // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS_16);
    movdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
    } else {
      movdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f);    //   tail count (in bytes)
    andl(limit, 0xfffffff0);     // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Compare the tail with one more 16-byte vector ending at the end of the range.
    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  if (expand_ary2) {
    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);
  } else {
    andl(limit, 0xfffffffc); // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_CHAR);
  }

  lea(ary1, Address(ary1, limit, scaleFactor));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  if (expand_ary2) {
    // There are no "vector" operations for bytes to shorts
    // Compare one element per iteration: a zero-extended byte of ary2 against a
    // 16-bit element of ary1.
    movzbl(chr, Address(ary2, limit, Address::times_1));
    cmpw(Address(ary1, limit, Address::times_2), chr);
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 1);
    jcc(Assembler::notZero, COMPARE_VECTORS);
    jmp(TRUE_LABEL);
  } else {
    movl(chr, Address(ary1, limit, Address::times_1));
    cmpl(chr, Address(ary2, limit, Address::times_1));
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 4);
    jcc(Assembler::notZero, COMPARE_VECTORS);
  }

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    // char[] arrays have an even byte count, so no single-byte tail is checked.
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1);   // tail  byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

// Out-of-line slow path for convertF2I. The stub data carries the destination
// register, the source XMM register and the address of the fixup routine. The
// source is spilled to a freshly reserved 8-byte stack slot, the fixup routine
// is called, and the slot is then popped into dst before jumping back to the
// continuation.
// NOTE(review): presumably the fixup routine leaves its result in that stack
// slot — confirm against the StubRoutines::x86::*_fixup stubs.
static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
#define __ masm.
  Register dst = stub.data<0>();
  XMMRegister src = stub.data<1>();
  address target = stub.data<2>();
  __ bind(stub.entry());
  __ subptr(rsp, 8);
  __ movdbl(Address(rsp), src);
  __ call(RuntimeAddress(target));
  // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
  __ pop(dst);
  __ jmp(stub.continuation());
#undef __
}

// Emits a float/double to int/long conversion.
//   dst_bt - T_INT or T_LONG
//   src_bt - T_FLOAT or T_DOUBLE
//   dst    - destination general purpose register
//   src    - source XMM register
// The truncating cvtt* instruction is emitted inline. Its result is compared
// with 0x80000000 (int) or with the 64-bit value at double_sign_flip() (long);
// on equality control transfers to an out-of-line stub that calls the matching
// fixup routine.
// NOTE(review): this relies on the cvtt* instructions producing that sentinel
// for NaN and out-of-range inputs — see the Intel SDM.
void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
  assert(dst_bt == T_INT || dst_bt == T_LONG, "");
  assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");

  address slowpath_target;
  if (dst_bt == T_INT) {
    if (src_bt == T_FLOAT) {
      cvttss2sil(dst, src);
      cmpl(dst, 0x80000000);
      slowpath_target = StubRoutines::x86::f2i_fixup();
    } else {
      cvttsd2sil(dst, src);
      cmpl(dst, 0x80000000);
      slowpath_target = StubRoutines::x86::d2i_fixup();
    }
  } else {
    if (src_bt == T_FLOAT) {
      cvttss2siq(dst, src);
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::f2l_fixup();
    } else {
      cvttsd2siq(dst, src);
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::d2l_fixup();
    }
  }

  // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
  int max_size = 23 + (UseAPX ? 1 : 0);
  auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
  // The flags still come from the cmpl/cmp64 emitted above.
  jcc(Assembler::equal, stub->entry());
  bind(stub->continuation());
}

// Emits the masked (opmask 'mask') vector shift or rotate by the immediate
// 'imm8' that corresponds to the ideal opcode 'ideal_opc'. 'merge' and
// 'vlen_enc' are passed through to the selected instruction; 'eType' is only
// used by the rotate forms. Any other opcode is a fatal error.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, int imm8, bool merge, int vlen_enc) {
  switch(ideal_opc) {
    case Op_LShiftVS:
      Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVI:
      Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVL:
      Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVS:
      Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVI:
      Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVL:
      Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVS:
      Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVI:
      Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVL:
      Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}

// Masked saturating add/sub with a register second operand: forwards to the
// unsigned or the signed emitter depending on 'is_unsigned'.
void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
                                               XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
  if (is_unsigned) {
    evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
  } else {
    evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
  }
}

// Emits the masked signed saturating add (Op_SaturatingAddV) or subtract
// (Op_SaturatingSubV, asserted) for T_BYTE or T_SHORT elements. Any other
// element type is a fatal error.
void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
                                                      XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
  switch (elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// Emits the masked unsigned saturating add (Op_SaturatingAddV) or subtract
// (Op_SaturatingSubV, asserted) for T_BYTE or T_SHORT elements. Any other
// element type is a fatal error.
void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
                                                        XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
  switch (elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
                                               Address src2, bool is_unsigned, bool merge, int vlen_enc) {
  if (is_unsigned) {
    evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2,
merge, vlen_enc); 4672 } else { 4673 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4674 } 4675 } 4676 4677 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4678 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4679 switch (elem_bt) { 4680 case T_BYTE: 4681 if (ideal_opc == Op_SaturatingAddV) { 4682 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4683 } else { 4684 assert(ideal_opc == Op_SaturatingSubV, ""); 4685 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4686 } 4687 break; 4688 case T_SHORT: 4689 if (ideal_opc == Op_SaturatingAddV) { 4690 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4691 } else { 4692 assert(ideal_opc == Op_SaturatingSubV, ""); 4693 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4694 } 4695 break; 4696 default: 4697 fatal("Unsupported type %s", type2name(elem_bt)); 4698 break; 4699 } 4700 } 4701 4702 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4703 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4704 switch (elem_bt) { 4705 case T_BYTE: 4706 if (ideal_opc == Op_SaturatingAddV) { 4707 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4708 } else { 4709 assert(ideal_opc == Op_SaturatingSubV, ""); 4710 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4711 } 4712 break; 4713 case T_SHORT: 4714 if (ideal_opc == Op_SaturatingAddV) { 4715 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4716 } else { 4717 assert(ideal_opc == Op_SaturatingSubV, ""); 4718 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4719 } 4720 break; 4721 default: 4722 fatal("Unsupported type %s", type2name(elem_bt)); 4723 break; 4724 } 4725 } 4726 4727 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4728 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4729 bool is_varshift) { 4730 
switch (ideal_opc) { 4731 case Op_AddVB: 4732 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4733 case Op_AddVS: 4734 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4735 case Op_AddVI: 4736 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4737 case Op_AddVL: 4738 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4739 case Op_AddVF: 4740 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4741 case Op_AddVD: 4742 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4743 case Op_SubVB: 4744 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4745 case Op_SubVS: 4746 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4747 case Op_SubVI: 4748 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4749 case Op_SubVL: 4750 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4751 case Op_SubVF: 4752 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4753 case Op_SubVD: 4754 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4755 case Op_MulVS: 4756 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4757 case Op_MulVI: 4758 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4759 case Op_MulVL: 4760 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4761 case Op_MulVF: 4762 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4763 case Op_MulVD: 4764 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4765 case Op_DivVF: 4766 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4767 case Op_DivVD: 4768 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4769 case Op_SqrtVF: 4770 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4771 case Op_SqrtVD: 4772 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4773 case Op_AbsVB: 4774 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4775 case Op_AbsVS: 4776 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4777 case Op_AbsVI: 4778 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4779 case Op_AbsVL: 4780 evpabsq(dst, mask, 
src2, merge, vlen_enc); break; 4781 case Op_FmaVF: 4782 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4783 case Op_FmaVD: 4784 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4785 case Op_VectorRearrange: 4786 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4787 case Op_LShiftVS: 4788 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4789 case Op_LShiftVI: 4790 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4791 case Op_LShiftVL: 4792 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4793 case Op_RShiftVS: 4794 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4795 case Op_RShiftVI: 4796 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4797 case Op_RShiftVL: 4798 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4799 case Op_URShiftVS: 4800 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4801 case Op_URShiftVI: 4802 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4803 case Op_URShiftVL: 4804 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4805 case Op_RotateLeftV: 4806 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4807 case Op_RotateRightV: 4808 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4809 case Op_MaxV: 4810 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4811 case Op_MinV: 4812 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4813 case Op_UMinV: 4814 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4815 case Op_UMaxV: 4816 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4817 case Op_XorV: 4818 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4819 case Op_OrV: 4820 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4821 case Op_AndV: 4822 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4823 default: 4824 
fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4825 break; 4826 } 4827 } 4828 4829 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4830 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4831 switch (ideal_opc) { 4832 case Op_AddVB: 4833 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4834 case Op_AddVS: 4835 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4836 case Op_AddVI: 4837 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4838 case Op_AddVL: 4839 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4840 case Op_AddVF: 4841 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4842 case Op_AddVD: 4843 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4844 case Op_SubVB: 4845 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4846 case Op_SubVS: 4847 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4848 case Op_SubVI: 4849 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4850 case Op_SubVL: 4851 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4852 case Op_SubVF: 4853 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4854 case Op_SubVD: 4855 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4856 case Op_MulVS: 4857 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4858 case Op_MulVI: 4859 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4860 case Op_MulVL: 4861 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4862 case Op_MulVF: 4863 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4864 case Op_MulVD: 4865 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4866 case Op_DivVF: 4867 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4868 case Op_DivVD: 4869 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4870 case Op_FmaVF: 4871 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4872 case Op_FmaVD: 4873 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); 
break; 4874 case Op_MaxV: 4875 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4876 case Op_MinV: 4877 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4878 case Op_UMaxV: 4879 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4880 case Op_UMinV: 4881 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4882 case Op_XorV: 4883 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4884 case Op_OrV: 4885 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4886 case Op_AndV: 4887 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4888 default: 4889 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4890 break; 4891 } 4892 } 4893 4894 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4895 KRegister src1, KRegister src2) { 4896 BasicType etype = T_ILLEGAL; 4897 switch(mask_len) { 4898 case 2: 4899 case 4: 4900 case 8: etype = T_BYTE; break; 4901 case 16: etype = T_SHORT; break; 4902 case 32: etype = T_INT; break; 4903 case 64: etype = T_LONG; break; 4904 default: fatal("Unsupported type"); break; 4905 } 4906 assert(etype != T_ILLEGAL, ""); 4907 switch(ideal_opc) { 4908 case Op_AndVMask: 4909 kand(etype, dst, src1, src2); break; 4910 case Op_OrVMask: 4911 kor(etype, dst, src1, src2); break; 4912 case Op_XorVMask: 4913 kxor(etype, dst, src1, src2); break; 4914 default: 4915 fatal("Unsupported masked operation"); break; 4916 } 4917 } 4918 4919 /* 4920 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4921 * If src is NaN, the result is 0. 4922 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4923 * the result is equal to the value of Integer.MIN_VALUE. 4924 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4925 * the result is equal to the value of Integer.MAX_VALUE. 
4926 */ 4927 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4928 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4929 Register rscratch, AddressLiteral float_sign_flip, 4930 int vec_enc) { 4931 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4932 Label done; 4933 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4934 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4935 vptest(xtmp2, xtmp2, vec_enc); 4936 jccb(Assembler::equal, done); 4937 4938 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4939 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4940 4941 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4942 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4943 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4944 4945 // Recompute the mask for remaining special value. 4946 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4947 // Extract SRC values corresponding to TRUE mask lanes. 4948 vpand(xtmp4, xtmp2, src, vec_enc); 4949 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4950 // values are set. 
4951 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4952 4953 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4954 bind(done); 4955 } 4956 4957 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4958 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4959 Register rscratch, AddressLiteral float_sign_flip, 4960 int vec_enc) { 4961 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4962 Label done; 4963 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4964 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4965 kortestwl(ktmp1, ktmp1); 4966 jccb(Assembler::equal, done); 4967 4968 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4969 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4970 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4971 4972 kxorwl(ktmp1, ktmp1, ktmp2); 4973 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4974 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4975 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4976 bind(done); 4977 } 4978 4979 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4980 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4981 Register rscratch, AddressLiteral double_sign_flip, 4982 int vec_enc) { 4983 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4984 4985 Label done; 4986 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4987 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4988 kortestwl(ktmp1, ktmp1); 4989 jccb(Assembler::equal, done); 4990 4991 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4992 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4993 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4994 4995 kxorwl(ktmp1, ktmp1, ktmp2); 4996 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4997 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4998 evmovdquq(dst, ktmp1, xtmp2, true, 
vec_enc); 4999 bind(done); 5000 } 5001 5002 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5003 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5004 Register rscratch, AddressLiteral float_sign_flip, 5005 int vec_enc) { 5006 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5007 Label done; 5008 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5009 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5010 kortestwl(ktmp1, ktmp1); 5011 jccb(Assembler::equal, done); 5012 5013 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5014 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5015 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5016 5017 kxorwl(ktmp1, ktmp1, ktmp2); 5018 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5019 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5020 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5021 bind(done); 5022 } 5023 5024 /* 5025 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5026 * If src is NaN, the result is 0. 5027 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 5028 * the result is equal to the value of Long.MIN_VALUE. 5029 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 5030 * the result is equal to the value of Long.MAX_VALUE. 
5031 */ 5032 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5033 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5034 Register rscratch, AddressLiteral double_sign_flip, 5035 int vec_enc) { 5036 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5037 5038 Label done; 5039 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5040 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 5041 kortestwl(ktmp1, ktmp1); 5042 jccb(Assembler::equal, done); 5043 5044 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5045 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5046 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5047 5048 kxorwl(ktmp1, ktmp1, ktmp2); 5049 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5050 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5051 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5052 bind(done); 5053 } 5054 5055 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 5056 XMMRegister xtmp, int index, int vec_enc) { 5057 assert(vec_enc < Assembler::AVX_512bit, ""); 5058 if (vec_enc == Assembler::AVX_256bit) { 5059 vextractf128_high(xtmp, src); 5060 vshufps(dst, src, xtmp, index, vec_enc); 5061 } else { 5062 vshufps(dst, src, zero, index, vec_enc); 5063 } 5064 } 5065 5066 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5067 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 5068 AddressLiteral float_sign_flip, int src_vec_enc) { 5069 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5070 5071 Label done; 5072 // Compare the destination lanes with float_sign_flip 5073 // value to get mask for all special values. 
5074 movdqu(xtmp1, float_sign_flip, rscratch); 5075 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 5076 ptest(xtmp2, xtmp2); 5077 jccb(Assembler::equal, done); 5078 5079 // Flip float_sign_flip to get max integer value. 5080 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 5081 pxor(xtmp1, xtmp4); 5082 5083 // Set detination lanes corresponding to unordered source lanes as zero. 5084 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 5085 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 5086 5087 // Shuffle mask vector and pack lower doubles word from each quadword lane. 5088 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5089 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 5090 5091 // Recompute the mask for remaining special value. 5092 pxor(xtmp2, xtmp3); 5093 // Extract mask corresponding to non-negative source lanes. 5094 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 5095 5096 // Shuffle mask vector and pack lower doubles word from each quadword lane. 5097 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5098 pand(xtmp3, xtmp2); 5099 5100 // Replace destination lanes holding special value(0x80000000) with max int 5101 // if corresponding source lane holds a +ve value. 
5102 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5103 bind(done); 5104 } 5105 5106 5107 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5108 XMMRegister xtmp, Register rscratch, int vec_enc) { 5109 switch(to_elem_bt) { 5110 case T_SHORT: 5111 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5112 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5113 vpackusdw(dst, dst, zero, vec_enc); 5114 if (vec_enc == Assembler::AVX_256bit) { 5115 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5116 } 5117 break; 5118 case T_BYTE: 5119 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5120 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5121 vpackusdw(dst, dst, zero, vec_enc); 5122 if (vec_enc == Assembler::AVX_256bit) { 5123 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5124 } 5125 vpackuswb(dst, dst, zero, vec_enc); 5126 break; 5127 default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt)); 5128 } 5129 } 5130 5131 /* 5132 * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):- 5133 * a) Perform vector D2L/F2I cast. 5134 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5135 * It signifies that source value could be any of the special floating point 5136 * values(NaN,-Inf,Inf,Max,-Min). 5137 * c) Set destination to zero if source is NaN value. 5138 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 
5139 */ 5140 5141 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5142 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5143 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5144 int to_elem_sz = type2aelembytes(to_elem_bt); 5145 assert(to_elem_sz <= 4, ""); 5146 vcvttps2dq(dst, src, vec_enc); 5147 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5148 if (to_elem_sz < 4) { 5149 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5150 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5151 } 5152 } 5153 5154 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5155 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5156 Register rscratch, int vec_enc) { 5157 int to_elem_sz = type2aelembytes(to_elem_bt); 5158 assert(to_elem_sz <= 4, ""); 5159 vcvttps2dq(dst, src, vec_enc); 5160 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5161 switch(to_elem_bt) { 5162 case T_INT: 5163 break; 5164 case T_SHORT: 5165 evpmovdw(dst, dst, vec_enc); 5166 break; 5167 case T_BYTE: 5168 evpmovdb(dst, dst, vec_enc); 5169 break; 5170 default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt)); 5171 } 5172 } 5173 5174 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5175 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5176 Register rscratch, int vec_enc) { 5177 evcvttps2qq(dst, src, vec_enc); 5178 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 5179 } 5180 5181 // Handling for downcasting from double to integer or sub-word types on AVX2. 
5182 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5183 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5184 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5185 int to_elem_sz = type2aelembytes(to_elem_bt); 5186 assert(to_elem_sz < 8, ""); 5187 vcvttpd2dq(dst, src, vec_enc); 5188 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5189 float_sign_flip, vec_enc); 5190 if (to_elem_sz < 4) { 5191 // xtmp4 holds all zero lanes. 5192 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5193 } 5194 } 5195 5196 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5197 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5198 KRegister ktmp2, AddressLiteral sign_flip, 5199 Register rscratch, int vec_enc) { 5200 if (VM_Version::supports_avx512dq()) { 5201 evcvttpd2qq(dst, src, vec_enc); 5202 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5203 switch(to_elem_bt) { 5204 case T_LONG: 5205 break; 5206 case T_INT: 5207 evpmovsqd(dst, dst, vec_enc); 5208 break; 5209 case T_SHORT: 5210 evpmovsqd(dst, dst, vec_enc); 5211 evpmovdw(dst, dst, vec_enc); 5212 break; 5213 case T_BYTE: 5214 evpmovsqd(dst, dst, vec_enc); 5215 evpmovdb(dst, dst, vec_enc); 5216 break; 5217 default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt)); 5218 } 5219 } else { 5220 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5221 vcvttpd2dq(dst, src, vec_enc); 5222 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5223 switch(to_elem_bt) { 5224 case T_INT: 5225 break; 5226 case T_SHORT: 5227 evpmovdw(dst, dst, vec_enc); 5228 break; 5229 case T_BYTE: 5230 evpmovdb(dst, dst, 
vec_enc); 5231 break; 5232 default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt)); 5233 } 5234 } 5235 } 5236 5237 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5238 switch(to_elem_bt) { 5239 case T_LONG: 5240 evcvttps2qqs(dst, src, vec_enc); 5241 break; 5242 case T_INT: 5243 evcvttps2dqs(dst, src, vec_enc); 5244 break; 5245 case T_SHORT: 5246 evcvttps2dqs(dst, src, vec_enc); 5247 evpmovdw(dst, dst, vec_enc); 5248 break; 5249 case T_BYTE: 5250 evcvttps2dqs(dst, src, vec_enc); 5251 evpmovdb(dst, dst, vec_enc); 5252 break; 5253 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt)); 5254 } 5255 } 5256 5257 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) { 5258 switch(to_elem_bt) { 5259 case T_LONG: 5260 evcvttps2qqs(dst, src, vec_enc); 5261 break; 5262 case T_INT: 5263 evcvttps2dqs(dst, src, vec_enc); 5264 break; 5265 case T_SHORT: 5266 evcvttps2dqs(dst, src, vec_enc); 5267 evpmovdw(dst, dst, vec_enc); 5268 break; 5269 case T_BYTE: 5270 evcvttps2dqs(dst, src, vec_enc); 5271 evpmovdb(dst, dst, vec_enc); 5272 break; 5273 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt)); 5274 } 5275 } 5276 5277 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5278 switch(to_elem_bt) { 5279 case T_LONG: 5280 evcvttpd2qqs(dst, src, vec_enc); 5281 break; 5282 case T_INT: 5283 evcvttpd2dqs(dst, src, vec_enc); 5284 break; 5285 case T_SHORT: 5286 evcvttpd2dqs(dst, src, vec_enc); 5287 evpmovdw(dst, dst, vec_enc); 5288 break; 5289 case T_BYTE: 5290 evcvttpd2dqs(dst, src, vec_enc); 5291 evpmovdb(dst, dst, vec_enc); 5292 break; 5293 default: assert(false, "Unexpected basic type for target of vector castD2X 
AVX10 (reg src): %s", type2name(to_elem_bt)); 5294 } 5295 } 5296 5297 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) { 5298 switch(to_elem_bt) { 5299 case T_LONG: 5300 evcvttpd2qqs(dst, src, vec_enc); 5301 break; 5302 case T_INT: 5303 evcvttpd2dqs(dst, src, vec_enc); 5304 break; 5305 case T_SHORT: 5306 evcvttpd2dqs(dst, src, vec_enc); 5307 evpmovdw(dst, dst, vec_enc); 5308 break; 5309 case T_BYTE: 5310 evcvttpd2dqs(dst, src, vec_enc); 5311 evpmovdb(dst, dst, vec_enc); 5312 break; 5313 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt)); 5314 } 5315 } 5316 5317 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5318 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5319 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5320 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5321 // and re-instantiate original MXCSR.RC mode after that. 5322 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5323 5324 mov64(tmp, julong_cast(0.5L)); 5325 evpbroadcastq(xtmp1, tmp, vec_enc); 5326 vaddpd(xtmp1, src , xtmp1, vec_enc); 5327 evcvtpd2qq(dst, xtmp1, vec_enc); 5328 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5329 double_sign_flip, vec_enc);; 5330 5331 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5332 } 5333 5334 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5335 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5336 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5337 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5338 // and re-instantiate original MXCSR.RC mode after that. 
5339 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5340 5341 movl(tmp, jint_cast(0.5)); 5342 movq(xtmp1, tmp); 5343 vbroadcastss(xtmp1, xtmp1, vec_enc); 5344 vaddps(xtmp1, src , xtmp1, vec_enc); 5345 vcvtps2dq(dst, xtmp1, vec_enc); 5346 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5347 float_sign_flip, vec_enc); 5348 5349 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5350 } 5351 5352 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5353 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5354 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5355 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5356 // and re-instantiate original MXCSR.RC mode after that. 5357 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5358 5359 movl(tmp, jint_cast(0.5)); 5360 movq(xtmp1, tmp); 5361 vbroadcastss(xtmp1, xtmp1, vec_enc); 5362 vaddps(xtmp1, src , xtmp1, vec_enc); 5363 vcvtps2dq(dst, xtmp1, vec_enc); 5364 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5365 5366 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5367 } 5368 5369 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5370 BasicType from_elem_bt, BasicType to_elem_bt) { 5371 switch (from_elem_bt) { 5372 case T_BYTE: 5373 switch (to_elem_bt) { 5374 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5375 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5376 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5377 default: ShouldNotReachHere(); 5378 } 5379 break; 5380 case T_SHORT: 5381 switch (to_elem_bt) { 5382 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5383 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5384 default: ShouldNotReachHere(); 5385 } 5386 break; 
5387 case T_INT: 5388 assert(to_elem_bt == T_LONG, ""); 5389 vpmovzxdq(dst, src, vlen_enc); 5390 break; 5391 default: 5392 ShouldNotReachHere(); 5393 } 5394 } 5395 5396 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5397 BasicType from_elem_bt, BasicType to_elem_bt) { 5398 switch (from_elem_bt) { 5399 case T_BYTE: 5400 switch (to_elem_bt) { 5401 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5402 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5403 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5404 default: ShouldNotReachHere(); 5405 } 5406 break; 5407 case T_SHORT: 5408 switch (to_elem_bt) { 5409 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5410 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5411 default: ShouldNotReachHere(); 5412 } 5413 break; 5414 case T_INT: 5415 assert(to_elem_bt == T_LONG, ""); 5416 vpmovsxdq(dst, src, vlen_enc); 5417 break; 5418 default: 5419 ShouldNotReachHere(); 5420 } 5421 } 5422 5423 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5424 BasicType dst_bt, BasicType src_bt, int vlen) { 5425 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5426 assert(vlen_enc != AVX_512bit, ""); 5427 5428 int dst_bt_size = type2aelembytes(dst_bt); 5429 int src_bt_size = type2aelembytes(src_bt); 5430 if (dst_bt_size > src_bt_size) { 5431 switch (dst_bt_size / src_bt_size) { 5432 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5433 case 4: vpmovsxbd(dst, src, vlen_enc); break; 5434 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5435 default: ShouldNotReachHere(); 5436 } 5437 } else { 5438 assert(dst_bt_size < src_bt_size, ""); 5439 switch (src_bt_size / dst_bt_size) { 5440 case 2: { 5441 if (vlen_enc == AVX_128bit) { 5442 vpacksswb(dst, src, src, vlen_enc); 5443 } else { 5444 vpacksswb(dst, src, src, vlen_enc); 5445 vpermq(dst, dst, 0x08, vlen_enc); 5446 } 5447 break; 5448 } 5449 case 4: { 5450 if (vlen_enc == 
AVX_128bit) { 5451 vpackssdw(dst, src, src, vlen_enc); 5452 vpacksswb(dst, dst, dst, vlen_enc); 5453 } else { 5454 vpackssdw(dst, src, src, vlen_enc); 5455 vpermq(dst, dst, 0x08, vlen_enc); 5456 vpacksswb(dst, dst, dst, AVX_128bit); 5457 } 5458 break; 5459 } 5460 case 8: { 5461 if (vlen_enc == AVX_128bit) { 5462 vpshufd(dst, src, 0x08, vlen_enc); 5463 vpackssdw(dst, dst, dst, vlen_enc); 5464 vpacksswb(dst, dst, dst, vlen_enc); 5465 } else { 5466 vpshufd(dst, src, 0x08, vlen_enc); 5467 vpermq(dst, dst, 0x08, vlen_enc); 5468 vpackssdw(dst, dst, dst, AVX_128bit); 5469 vpacksswb(dst, dst, dst, AVX_128bit); 5470 } 5471 break; 5472 } 5473 default: ShouldNotReachHere(); 5474 } 5475 } 5476 } 5477 5478 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5479 bool merge, BasicType bt, int vlen_enc) { 5480 if (bt == T_INT) { 5481 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5482 } else { 5483 assert(bt == T_LONG, ""); 5484 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5485 } 5486 } 5487 5488 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5489 bool merge, BasicType bt, int vlen_enc) { 5490 if (bt == T_INT) { 5491 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5492 } else { 5493 assert(bt == T_LONG, ""); 5494 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5495 } 5496 } 5497 5498 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5499 Register rtmp2, XMMRegister xtmp, int mask_len, 5500 int vec_enc) { 5501 int index = 0; 5502 int vindex = 0; 5503 mov64(rtmp1, 0x0101010101010101L); 5504 pdepq(rtmp1, src, rtmp1); 5505 if (mask_len > 8) { 5506 movq(rtmp2, src); 5507 vpxor(xtmp, xtmp, xtmp, vec_enc); 5508 movq(xtmp, rtmp1); 5509 } 5510 movq(dst, rtmp1); 5511 5512 mask_len -= 8; 5513 while (mask_len > 0) { 5514 assert ((mask_len & 0x7) == 0, "mask must be multiple of 
8"); 5515 index++; 5516 if ((index % 2) == 0) { 5517 pxor(xtmp, xtmp); 5518 } 5519 mov64(rtmp1, 0x0101010101010101L); 5520 shrq(rtmp2, 8); 5521 pdepq(rtmp1, rtmp2, rtmp1); 5522 pinsrq(xtmp, rtmp1, index % 2); 5523 vindex = index / 2; 5524 if (vindex) { 5525 // Write entire 16 byte vector when both 64 bit 5526 // lanes are update to save redundant instructions. 5527 if (index % 2) { 5528 vinsertf128(dst, dst, xtmp, vindex); 5529 } 5530 } else { 5531 vmovdqu(dst, xtmp); 5532 } 5533 mask_len -= 8; 5534 } 5535 } 5536 5537 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5538 switch(opc) { 5539 case Op_VectorMaskTrueCount: 5540 popcntq(dst, tmp); 5541 break; 5542 case Op_VectorMaskLastTrue: 5543 if (VM_Version::supports_lzcnt()) { 5544 lzcntq(tmp, tmp); 5545 movl(dst, 63); 5546 subl(dst, tmp); 5547 } else { 5548 movl(dst, -1); 5549 bsrq(tmp, tmp); 5550 cmov32(Assembler::notZero, dst, tmp); 5551 } 5552 break; 5553 case Op_VectorMaskFirstTrue: 5554 if (VM_Version::supports_bmi1()) { 5555 if (masklen < 32) { 5556 orl(tmp, 1 << masklen); 5557 tzcntl(dst, tmp); 5558 } else if (masklen == 32) { 5559 tzcntl(dst, tmp); 5560 } else { 5561 assert(masklen == 64, ""); 5562 tzcntq(dst, tmp); 5563 } 5564 } else { 5565 if (masklen < 32) { 5566 orl(tmp, 1 << masklen); 5567 bsfl(dst, tmp); 5568 } else { 5569 assert(masklen == 32 || masklen == 64, ""); 5570 movl(dst, masklen); 5571 if (masklen == 32) { 5572 bsfl(tmp, tmp); 5573 } else { 5574 bsfq(tmp, tmp); 5575 } 5576 cmov32(Assembler::notZero, dst, tmp); 5577 } 5578 } 5579 break; 5580 case Op_VectorMaskToLong: 5581 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5582 break; 5583 default: assert(false, "Unhandled mask operation"); 5584 } 5585 } 5586 5587 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5588 int masklen, int masksize, int vec_enc) { 5589 assert(VM_Version::supports_popcnt(), ""); 5590 5591 
if(VM_Version::supports_avx512bw()) { 5592 kmovql(tmp, mask); 5593 } else { 5594 assert(masklen <= 16, ""); 5595 kmovwl(tmp, mask); 5596 } 5597 5598 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5599 // operations needs to be clipped. 5600 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5601 andq(tmp, (1 << masklen) - 1); 5602 } 5603 5604 vector_mask_operation_helper(opc, dst, tmp, masklen); 5605 } 5606 5607 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5608 Register tmp, int masklen, BasicType bt, int vec_enc) { 5609 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5610 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5611 assert(VM_Version::supports_popcnt(), ""); 5612 5613 bool need_clip = false; 5614 switch(bt) { 5615 case T_BOOLEAN: 5616 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5617 vpxor(xtmp, xtmp, xtmp, vec_enc); 5618 vpsubb(xtmp, xtmp, mask, vec_enc); 5619 vpmovmskb(tmp, xtmp, vec_enc); 5620 need_clip = masklen < 16; 5621 break; 5622 case T_BYTE: 5623 vpmovmskb(tmp, mask, vec_enc); 5624 need_clip = masklen < 16; 5625 break; 5626 case T_SHORT: 5627 vpacksswb(xtmp, mask, mask, vec_enc); 5628 if (masklen >= 16) { 5629 vpermpd(xtmp, xtmp, 8, vec_enc); 5630 } 5631 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5632 need_clip = masklen < 16; 5633 break; 5634 case T_INT: 5635 case T_FLOAT: 5636 vmovmskps(tmp, mask, vec_enc); 5637 need_clip = masklen < 4; 5638 break; 5639 case T_LONG: 5640 case T_DOUBLE: 5641 vmovmskpd(tmp, mask, vec_enc); 5642 need_clip = masklen < 2; 5643 break; 5644 default: assert(false, "Unhandled type, %s", type2name(bt)); 5645 } 5646 5647 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5648 // operations needs to be clipped. 
5649 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5650 // need_clip implies masklen < 32 5651 andq(tmp, (1 << masklen) - 1); 5652 } 5653 5654 vector_mask_operation_helper(opc, dst, tmp, masklen); 5655 } 5656 5657 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5658 Register rtmp2, int mask_len) { 5659 kmov(rtmp1, src); 5660 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5661 mov64(rtmp2, -1L); 5662 pextq(rtmp2, rtmp2, rtmp1); 5663 kmov(dst, rtmp2); 5664 } 5665 5666 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5667 XMMRegister mask, Register rtmp, Register rscratch, 5668 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5669 int vec_enc) { 5670 assert(type2aelembytes(bt) >= 4, ""); 5671 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5672 address compress_perm_table = nullptr; 5673 address expand_perm_table = nullptr; 5674 if (type2aelembytes(bt) == 8) { 5675 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5676 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5677 vmovmskpd(rtmp, mask, vec_enc); 5678 } else { 5679 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5680 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5681 vmovmskps(rtmp, mask, vec_enc); 5682 } 5683 shlq(rtmp, 5); // for 32 byte permute row. 
5684 if (opcode == Op_CompressV) { 5685 lea(rscratch, ExternalAddress(compress_perm_table)); 5686 } else { 5687 lea(rscratch, ExternalAddress(expand_perm_table)); 5688 } 5689 addptr(rtmp, rscratch); 5690 vmovdqu(permv, Address(rtmp)); 5691 vpermps(dst, permv, src, Assembler::AVX_256bit); 5692 vpxor(xtmp, xtmp, xtmp, vec_enc); 5693 // Blend the result with zero vector using permute mask, each column entry 5694 // in a permute table row contains either a valid permute index or a -1 (default) 5695 // value, this can potentially be used as a blending mask after 5696 // compressing/expanding the source vector lanes. 5697 vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv); 5698 } 5699 5700 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5701 bool merge, BasicType bt, int vec_enc) { 5702 if (opcode == Op_CompressV) { 5703 switch(bt) { 5704 case T_BYTE: 5705 evpcompressb(dst, mask, src, merge, vec_enc); 5706 break; 5707 case T_CHAR: 5708 case T_SHORT: 5709 evpcompressw(dst, mask, src, merge, vec_enc); 5710 break; 5711 case T_INT: 5712 evpcompressd(dst, mask, src, merge, vec_enc); 5713 break; 5714 case T_FLOAT: 5715 evcompressps(dst, mask, src, merge, vec_enc); 5716 break; 5717 case T_LONG: 5718 evpcompressq(dst, mask, src, merge, vec_enc); 5719 break; 5720 case T_DOUBLE: 5721 evcompresspd(dst, mask, src, merge, vec_enc); 5722 break; 5723 default: 5724 fatal("Unsupported type %s", type2name(bt)); 5725 break; 5726 } 5727 } else { 5728 assert(opcode == Op_ExpandV, ""); 5729 switch(bt) { 5730 case T_BYTE: 5731 evpexpandb(dst, mask, src, merge, vec_enc); 5732 break; 5733 case T_CHAR: 5734 case T_SHORT: 5735 evpexpandw(dst, mask, src, merge, vec_enc); 5736 break; 5737 case T_INT: 5738 evpexpandd(dst, mask, src, merge, vec_enc); 5739 break; 5740 case T_FLOAT: 5741 evexpandps(dst, mask, src, merge, vec_enc); 5742 break; 5743 case T_LONG: 5744 evpexpandq(dst, mask, src, merge, vec_enc); 5745 break; 5746 case T_DOUBLE: 
5747 evexpandpd(dst, mask, src, merge, vec_enc); 5748 break; 5749 default: 5750 fatal("Unsupported type %s", type2name(bt)); 5751 break; 5752 } 5753 } 5754 } 5755 5756 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5757 KRegister ktmp1, int vec_enc) { 5758 if (opcode == Op_SignumVD) { 5759 vsubpd(dst, zero, one, vec_enc); 5760 // if src < 0 ? -1 : 1 5761 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5762 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5763 // if src == NaN, -0.0 or 0.0 return src. 5764 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5765 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5766 } else { 5767 assert(opcode == Op_SignumVF, ""); 5768 vsubps(dst, zero, one, vec_enc); 5769 // if src < 0 ? -1 : 1 5770 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5771 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5772 // if src == NaN, -0.0 or 0.0 return src. 5773 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5774 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5775 } 5776 } 5777 5778 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5779 XMMRegister xtmp1, int vec_enc) { 5780 if (opcode == Op_SignumVD) { 5781 vsubpd(dst, zero, one, vec_enc); 5782 // if src < 0 ? -1 : 1 5783 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5784 // if src == NaN, -0.0 or 0.0 return src. 5785 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5786 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5787 } else { 5788 assert(opcode == Op_SignumVF, ""); 5789 vsubps(dst, zero, one, vec_enc); 5790 // if src < 0 ? -1 : 1 5791 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5792 // if src == NaN, -0.0 or 0.0 return src. 
5793 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5794 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5795 } 5796 } 5797 5798 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5799 if (VM_Version::supports_avx512bw()) { 5800 if (mask_len > 32) { 5801 kmovql(dst, src); 5802 } else { 5803 kmovdl(dst, src); 5804 if (mask_len != 32) { 5805 kshiftrdl(dst, dst, 32 - mask_len); 5806 } 5807 } 5808 } else { 5809 assert(mask_len <= 16, ""); 5810 kmovwl(dst, src); 5811 if (mask_len != 16) { 5812 kshiftrwl(dst, dst, 16 - mask_len); 5813 } 5814 } 5815 } 5816 5817 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5818 int lane_size = type2aelembytes(bt); 5819 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5820 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) { 5821 movptr(rtmp, imm32); 5822 switch(lane_size) { 5823 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5824 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5825 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5826 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5827 fatal("Unsupported lane size %d", lane_size); 5828 break; 5829 } 5830 } else { 5831 movptr(rtmp, imm32); 5832 movq(dst, rtmp); 5833 switch(lane_size) { 5834 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5835 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5836 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5837 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5838 fatal("Unsupported lane size %d", lane_size); 5839 break; 5840 } 5841 } 5842 } 5843 5844 // 5845 // Following is lookup table based popcount computation algorithm:- 5846 // Index Bit set count 5847 // [ 0000 -> 0, 5848 // 0001 -> 1, 5849 // 0010 -> 1, 5850 // 0011 -> 2, 5851 // 0100 -> 1, 5852 // 0101 -> 2, 5853 // 0110 -> 2, 5854 // 0111 -> 3, 5855 // 1000 -> 1, 5856 // 1001 -> 2, 5857 // 1010 -> 3, 5858 // 1011 -> 3, 5859 // 
1100 -> 2, 5860 // 1101 -> 3, 5861 // 1111 -> 4 ] 5862 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5863 // shuffle indices for lookup table access. 5864 // b. Right shift each byte of vector lane by 4 positions. 5865 // c. Count the number of 1s in 4 MSB bits each byte. These bits are used as 5866 // shuffle indices for lookup table access. 5867 // d. Add the bitset count of upper and lower 4 bits of each byte. 5868 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5869 // count of all the bytes of a quadword. 5870 // f. Perform step e. for upper 128bit vector lane. 5871 // g. Pack the bitset count of quadwords back to double word. 5872 // h. Unpacking and packing operations are not needed for 64bit vector lane. 5873 5874 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5875 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5876 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5877 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5878 vpsrlw(dst, src, 4, vec_enc); 5879 vpand(dst, dst, xtmp1, vec_enc); 5880 vpand(xtmp1, src, xtmp1, vec_enc); 5881 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5882 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5883 vpshufb(dst, xtmp2, dst, vec_enc); 5884 vpaddb(dst, dst, xtmp1, vec_enc); 5885 } 5886 5887 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5888 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5889 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5890 // Following code is as per steps e,f,g and h of above algorithm. 
5891 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5892 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5893 vpsadbw(dst, dst, xtmp2, vec_enc); 5894 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5895 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5896 vpackuswb(dst, xtmp1, dst, vec_enc); 5897 } 5898 5899 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5900 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5901 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5902 // Add the popcount of upper and lower bytes of word. 5903 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5904 vpsrlw(dst, xtmp1, 8, vec_enc); 5905 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5906 vpaddw(dst, dst, xtmp1, vec_enc); 5907 } 5908 5909 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5910 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5911 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5912 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5913 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5914 } 5915 5916 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5917 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5918 switch(bt) { 5919 case T_LONG: 5920 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5921 break; 5922 case T_INT: 5923 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5924 break; 5925 case T_CHAR: 5926 case T_SHORT: 5927 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5928 break; 5929 case T_BYTE: 5930 case T_BOOLEAN: 5931 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5932 break; 5933 default: 5934 fatal("Unsupported type %s", type2name(bt)); 5935 break; 5936 } 5937 } 5938 5939 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5940 KRegister mask, bool merge, int vec_enc) { 5941 assert(VM_Version::supports_avx512vl() || vec_enc == 
Assembler::AVX_512bit, ""); 5942 switch(bt) { 5943 case T_LONG: 5944 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5945 evpopcntq(dst, mask, src, merge, vec_enc); 5946 break; 5947 case T_INT: 5948 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5949 evpopcntd(dst, mask, src, merge, vec_enc); 5950 break; 5951 case T_CHAR: 5952 case T_SHORT: 5953 assert(VM_Version::supports_avx512_bitalg(), ""); 5954 evpopcntw(dst, mask, src, merge, vec_enc); 5955 break; 5956 case T_BYTE: 5957 case T_BOOLEAN: 5958 assert(VM_Version::supports_avx512_bitalg(), ""); 5959 evpopcntb(dst, mask, src, merge, vec_enc); 5960 break; 5961 default: 5962 fatal("Unsupported type %s", type2name(bt)); 5963 break; 5964 } 5965 } 5966 5967 // Bit reversal algorithm first reverses the bits of each byte followed by 5968 // a byte level reversal for multi-byte primitive types (short/int/long). 5969 // Algorithm performs a lookup table access to get reverse bit sequence 5970 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5971 // is obtained by swapping the reverse bit sequences of upper and lower 5972 // nibble of a byte. 5973 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5974 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5975 if (VM_Version::supports_avx512vlbw()) { 5976 5977 // Get the reverse bit sequence of lower nibble of each byte. 5978 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5979 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5980 evpandq(dst, xtmp2, src, vec_enc); 5981 vpshufb(dst, xtmp1, dst, vec_enc); 5982 vpsllq(dst, dst, 4, vec_enc); 5983 5984 // Get the reverse bit sequence of upper nibble of each byte. 
5985 vpandn(xtmp2, xtmp2, src, vec_enc); 5986 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5987 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5988 5989 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5990 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5991 evporq(xtmp2, dst, xtmp2, vec_enc); 5992 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5993 5994 } else if(vec_enc == Assembler::AVX_512bit) { 5995 // Shift based bit reversal. 5996 assert(bt == T_LONG || bt == T_INT, ""); 5997 5998 // Swap lower and upper nibble of each byte. 5999 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 6000 6001 // Swap two least and most significant bits of each nibble. 6002 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 6003 6004 // Swap adjacent pair of bits. 6005 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6006 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 6007 6008 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6009 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 6010 } else { 6011 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 6012 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6013 6014 // Get the reverse bit sequence of lower nibble of each byte. 6015 vpand(dst, xtmp2, src, vec_enc); 6016 vpshufb(dst, xtmp1, dst, vec_enc); 6017 vpsllq(dst, dst, 4, vec_enc); 6018 6019 // Get the reverse bit sequence of upper nibble of each byte. 6020 vpandn(xtmp2, xtmp2, src, vec_enc); 6021 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6022 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6023 6024 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6025 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
6026 vpor(xtmp2, dst, xtmp2, vec_enc); 6027 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6028 } 6029 } 6030 6031 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 6032 XMMRegister xtmp, Register rscratch) { 6033 assert(VM_Version::supports_gfni(), ""); 6034 assert(rscratch != noreg || always_reachable(mask), "missing"); 6035 6036 // Galois field instruction based bit reversal based on following algorithm. 6037 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6038 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 6039 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 6040 vector_reverse_byte(bt, dst, xtmp, vec_enc); 6041 } 6042 6043 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 6044 XMMRegister xtmp1, Register rtmp, int vec_enc) { 6045 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 6046 evpandq(dst, xtmp1, src, vec_enc); 6047 vpsllq(dst, dst, nbits, vec_enc); 6048 vpandn(xtmp1, xtmp1, src, vec_enc); 6049 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 6050 evporq(dst, dst, xtmp1, vec_enc); 6051 } 6052 6053 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6054 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6055 // Shift based bit reversal. 6056 assert(VM_Version::supports_evex(), ""); 6057 switch(bt) { 6058 case T_LONG: 6059 // Swap upper and lower double word of each quad word. 6060 evprorq(xtmp1, k0, src, 32, true, vec_enc); 6061 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 6062 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6063 break; 6064 case T_INT: 6065 // Swap upper and lower word of each double word. 6066 evprord(xtmp1, k0, src, 16, true, vec_enc); 6067 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6068 break; 6069 case T_CHAR: 6070 case T_SHORT: 6071 // Swap upper and lower byte of each word. 
6072 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 6073 break; 6074 case T_BYTE: 6075 evmovdquq(dst, k0, src, true, vec_enc); 6076 break; 6077 default: 6078 fatal("Unsupported type %s", type2name(bt)); 6079 break; 6080 } 6081 } 6082 6083 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6084 if (bt == T_BYTE) { 6085 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6086 evmovdquq(dst, k0, src, true, vec_enc); 6087 } else { 6088 vmovdqu(dst, src); 6089 } 6090 return; 6091 } 6092 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6093 // pre-computed shuffle indices. 6094 switch(bt) { 6095 case T_LONG: 6096 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6097 break; 6098 case T_INT: 6099 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6100 break; 6101 case T_CHAR: 6102 case T_SHORT: 6103 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6104 break; 6105 default: 6106 fatal("Unsupported type %s", type2name(bt)); 6107 break; 6108 } 6109 vpshufb(dst, src, dst, vec_enc); 6110 } 6111 6112 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6113 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6114 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6115 assert(is_integral_type(bt), ""); 6116 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6117 assert(VM_Version::supports_avx512cd(), ""); 6118 switch(bt) { 6119 case T_LONG: 6120 evplzcntq(dst, ktmp, src, merge, vec_enc); 6121 break; 6122 case T_INT: 6123 evplzcntd(dst, ktmp, src, merge, vec_enc); 6124 break; 6125 case T_SHORT: 6126 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6127 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6128 
evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6129 vpunpckhwd(dst, xtmp1, src, vec_enc); 6130 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6131 vpackusdw(dst, xtmp2, dst, vec_enc); 6132 break; 6133 case T_BYTE: 6134 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6135 // accessing the lookup table. 6136 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6137 // accessing the lookup table. 6138 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6139 assert(VM_Version::supports_avx512bw(), ""); 6140 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6141 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6142 vpand(xtmp2, dst, src, vec_enc); 6143 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6144 vpsrlw(xtmp3, src, 4, vec_enc); 6145 vpand(xtmp3, dst, xtmp3, vec_enc); 6146 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6147 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6148 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6149 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6150 break; 6151 default: 6152 fatal("Unsupported type %s", type2name(bt)); 6153 break; 6154 } 6155 } 6156 6157 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6158 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6159 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6160 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6161 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6162 // accessing the lookup table. 6163 vpand(dst, xtmp2, src, vec_enc); 6164 vpshufb(dst, xtmp1, dst, vec_enc); 6165 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6166 // accessing the lookup table. 6167 vpsrlw(xtmp3, src, 4, vec_enc); 6168 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6169 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6170 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  vpaddb(dst, dst, xtmp2, vec_enc);
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}

// Emits an AVX sequence that leaves, in every 16-bit lane of dst, the number of
// leading zero bits of the matching lane of src.
//
// Per-byte counts are produced first by vector_count_leading_zeros_byte_avx(); the
// count of the lower byte is then added to the count of the upper byte for every
// word whose upper byte is zero.
//
//   dst          - receives the per-word counts
//   src          - input vector; it is read again after dst has been written, so the
//                  sequence relies on dst and src being different registers
//   xtmp1..xtmp3 - vector temporaries, overwritten
//   rtmp         - general purpose temporary, only forwarded to the byte variant
//   vec_enc      - vector length encoding handed to every emitted instruction
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  // xtmp3 = upper byte of every word, moved down into the lower byte position.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  // xtmp3 = per-word mask, set where the upper byte of src is zero.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  // Upper byte of xtmp2 = lower byte count + upper byte count.
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  // Use the summed counts for the words selected by the mask.
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  // The wanted count sits in the upper byte; move it down to form the word result.
  vpsrlw(dst, dst, 8, vec_enc);
}

// Emits an AVX sequence that leaves, in every 32-bit lane of dst, the number of
// leading zero bits of the matching lane of src (32 for a zero lane).
//
//   dst          - receives the per-lane counts
//   src          - input vector; it is read again after dst has been written, so the
//                  sequence relies on dst and src being different registers
//   xtmp1, xtmp2 - vector temporaries, overwritten
//   xtmp3        - scratch register handed to vblendvps
//   vec_enc      - vector length encoding handed to every emitted instruction
void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
  // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
  // exponent as the leading zero count.

  // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
  // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
  // contributes to the leading number of zeros.
  vpsrld(dst, src, 1, vec_enc);
  vpandn(dst, dst, src, vec_enc);

  vcvtdq2ps(dst, dst, vec_enc);

  // By comparing the register to itself, all the bits in the destination are set.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);

  // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
  // xtmp2 = all ones >>> 24 = 0xFF in every lane.
  vpsrld(xtmp2, xtmp1, 24, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, xtmp2, dst, vec_enc);

  // Subtract 127 from the exponent, which removes the bias from the exponent.
  // xtmp2 = all ones >>> 25 = 127 in every lane.
  vpsrld(xtmp2, xtmp1, 25, vec_enc);
  vpsubd(dst, dst, xtmp2, vec_enc);

  // xtmp2 = all ones >>> 27 = 31 in every lane.
  vpsrld(xtmp2, xtmp1, 27, vec_enc);

  // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
  // is found in any of the lanes, replace the lane with -1 from xtmp1.
  vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);

  // If the original value is negative, replace the lane with 31.
  vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);

  // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
  // and for negative numbers the result is 0 as the exponent was replaced with 31.
  vpsubd(dst, xtmp2, dst, vec_enc);
}

// Emits an AVX sequence that leaves, in every 64-bit lane of dst, the number of
// leading zero bits of the matching lane of src.
//
// The 32-bit variant is run on both halves of every long and the two partial counts
// are then combined.
//
//   dst          - receives the per-lane counts
//   src          - input vector
//   xtmp1..xtmp3 - vector temporaries, overwritten
//   rtmp         - not referenced by this sequence
//   vec_enc      - vector length encoding handed to every emitted instruction
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // Find the leading zeros of the top and bottom halves of the long individually.
  vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);

  // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
  vpsrlq(xtmp1, dst, 32, vec_enc);
  // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
  // be in the most significant position of the bottom half.
  vpsrlq(xtmp2, dst, 6, vec_enc);

  // In the bottom half, add the top half and bottom half results.
  vpaddq(dst, xtmp1, dst, vec_enc);

  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
  // which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
  // the lane as required.
  vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
}

// Dispatches the AVX leading-zero-count sequence that matches the element type bt.
//
// Only integral element types are accepted and vec_enc must be below
// Assembler::AVX_512bit (both asserted). T_LONG, T_INT, T_SHORT and T_BYTE are
// handled; any other type ends in fatal().
// All remaining arguments are forwarded to the type specific emitter; the T_INT
// variant does not take rtmp.
void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
                                                       XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                       Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "unexpected type");
  assert(vec_enc < Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_INT:
      vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
      break;
    case T_SHORT:
      vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_BYTE:
      vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// Emits the packed integer subtract (vpsubb/vpsubw/vpsubd/vpsubq) whose lane width
// matches bt, with operands (dst, src1, src2) passed through unchanged.
// Types other than T_BYTE, T_SHORT, T_INT and T_LONG end in fatal().
void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
  switch(bt) {
    case T_BYTE:
      vpsubb(dst, src1, src2, vec_enc);
      break;
    case T_SHORT:
      vpsubw(dst, src1, src2, vec_enc);
      break;
    case T_INT:
      vpsubd(dst, src1, src2, vec_enc);
      break;
    case T_LONG:
      vpsubq(dst, src1, src2, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// Trailing zero count computation is based on leading zero
// count operation as per following equation. All AVX3 targets support AVX512CD feature which offers
// direct vector instruction to compute leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
//
//   bt           - integral element type (asserted)
//   dst          - receives the per-lane trailing zero counts
//   src          - input vector
//   xtmp1..xtmp3 - temporaries forwarded to vector_count_leading_zeros_evex
//   xtmp4        - holds (x - 1) & ~x, later the broadcast lane width; overwritten
//   ktmp, rtmp   - mask / general purpose temporaries forwarded to the helpers
//   vec_enc      - vector length encoding handed to every emitted instruction
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  // dst = CLZ(xtmp)
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  // xtmp = lane width in bits, broadcast to every lane
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  // dst = lane width - CLZ
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}

// Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
//
//   bt           - integral element type (asserted)
//   dst          - receives the per-lane trailing zero counts
//   src          - input vector
//   xtmp1, xtmp2 - temporaries forwarded to vector_popcount_integral; xtmp1 afterwards
//                  holds the broadcast lane width
//   xtmp3        - holds x | -x; overwritten
//   rtmp         - general purpose temporary forwarded to the helpers
//   vec_enc      - vector length encoding handed to every emitted instruction
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  // dst = POPC(xtmp)
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  // xtmp1 = lane width in bits, broadcast to every lane
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  // dst = lane width - POPC
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}

// Emits 32-bit unsigned division. The dividend is taken from `rax` and the quotient
// is left in `rax`; `rdx` is used as scratch.
// A divisor with the sign bit set is handled without a divide instruction.
// NOTE(review): the parameters are presumably bound to the real rax/rdx registers,
// since divl uses them implicitly - confirm at the call sites.
void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Zero the upper half of the dividend before the unsigned divide.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  // rax = dividend & ~(dividend - divisor); a single andn when BMI1 is available.
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}

// Emits 32-bit unsigned remainder. The dividend is taken from `rax`; the fast path
// leaves the remainder in `rdx` and overwrites `rax`.
// A divisor with the sign bit set is handled without a divide instruction.
// NOTE(review): the parameters are presumably bound to the real rax/rdx registers,
// since divl uses them implicitly - confirm at the call sites.
void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Zero the upper half of the dividend before the unsigned divide.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  // rdx keeps the dividend, rax becomes the working value.
  movl(rdx, rax);
  subl(rax, divisor);
  // rax = dividend & ~(dividend - divisor); a single andn when BMI1 is available.
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // Arithmetic shift turns the top bit into an all-ones / all-zeros mask.
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}

// Emits 32-bit unsigned division and remainder in one sequence. The dividend is
// taken from `rax`; the fast path leaves the quotient in `rax` and the remainder in
// `rdx`, using `tmp` as scratch.
// A divisor with the sign bit set is handled without a divide instruction.
// NOTE(review): the parameters are presumably bound to the real rax/rdx registers,
// since divl uses them implicitly - confirm at the call sites.
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Zero the upper half of the dividend before the unsigned divide.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  // rax = dividend & ~(dividend - divisor); a single andn when BMI1 is available.
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // Keep a copy: the logical shift yields the quotient, the arithmetic one the mask.
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}

// Emits a bit reversal of the 32-bit value in src, leaving the result in dst.
//
// Both paths first reverse the bits inside every byte and then reverse the byte
// order with bswapl. With GFNI the per-byte reversal is a single affine transform
// and uses xtmp1/xtmp2/rtmp; otherwise it is done with three mask-and-shift rounds
// that use only rtmp.
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  // Bits are now reversed within each byte; reversing the byte order completes it.
  bswapl(dst);
}

// Emits a bit reversal of the 64-bit value in src, leaving the result in dst.
//
// Same scheme as reverseI: reverse the bits inside every byte, then reverse the byte
// order with bswapq. The GFNI path uses xtmp1/xtmp2/rtmp1; the fallback uses rtmp1
// for the shifted half and rtmp2 for the 64-bit masks (and their complements).
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    // Complementing the mask selects the other half of the bits.
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  // Bits are now reversed within each byte; reversing the byte order completes it.
  bswapq(dst);
}

// Emits 64-bit unsigned division. The dividend is taken from `rax` and the quotient
// is left in `rax`; `rdx` is used as scratch.
// A divisor with the sign bit set is handled without a divide instruction.
// NOTE(review): the parameters are presumably bound to the real rax/rdx registers,
// since divq uses them implicitly - confirm at the call sites.
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Zero the upper half of the dividend before the unsigned divide.
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  // rax = dividend & ~(dividend - divisor); a single andn when BMI1 is available.
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

// Emits 64-bit unsigned remainder. The dividend is taken from `rax`; the fast path
// leaves the remainder in `rdx` and overwrites `rax`.
// A divisor with the sign bit set is handled without a divide instruction.
// NOTE(review): the parameters are presumably bound to the real rax/rdx registers,
// since divq uses them implicitly - confirm at the call sites.
void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Zero the upper half of the dividend before the unsigned divide.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  // rdx keeps the dividend, rax becomes the working value.
  movq(rdx, rax);
  subq(rax, divisor);
  // rax = dividend & ~(dividend - divisor); a single andn when BMI1 is available.
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // Arithmetic shift turns the top bit into an all-ones / all-zeros mask.
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

// Emits 64-bit unsigned division and remainder in one sequence. The dividend is
// taken from `rax`; the fast path leaves the quotient in `rax` and the remainder in
// `rdx`, using `tmp` as scratch.
// A divisor with the sign bit set is handled without a divide instruction.
// NOTE(review): the parameters are presumably bound to the real rax/rdx registers,
// since divq uses them implicitly - confirm at the call sites.
void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Zero the upper half of the dividend before the unsigned divide.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  // rax = dividend & ~(dividend - divisor); a single andn when BMI1 is available.
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // Keep a copy: the logical shift yields the quotient, the arithmetic one the mask.
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}

// Emits a byte rearrange (dst[i] = src[shuffle[i]]) for vectors wider than one
// 128-bit lane. Requires AVX512BW (asserted).
//
// Each of the four 128-bit lanes of src is broadcast in turn and shuffled in-lane;
// a mask derived from the index range (0-15, 16-31, 32-47, 48-63) decides which
// destination bytes each round writes.
//
//   dst          - receives the rearranged bytes
//   shuffle      - byte indices
//   src          - source vector
//   xtmp1, xtmp2 - hold the range bounds (16/48 and 32/64); xtmp2 also holds the
//                  broadcast of the first lane
//   xtmp3        - holds the broadcast of the second to fourth lane
//   rtmp         - used to materialize the constant 16
//   ktmp         - range mask of the current round
//   vlen_enc     - vector length encoding handed to every emitted instruction
void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using
  // lower 4 bit of each shuffle lane, thus all shuffle indices are
  // normalized to index range 0-15. This makes sure that all the multiples
  // of an index value are placed at same relative position in 128 bit
  // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be 16th element in their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  // NOTE(review): the boolean is presumably the merge flag - false here so that this
  // first round initializes all of dst, true in the later rounds so that bytes
  // written earlier are kept. Confirm against the evpshufb declaration.
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 16 << 1 = 32 in every byte.
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  // xtmp1 = 16 + 32 = 48 in every byte.
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 32 << 1 = 64 in every byte.
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}

// Emits a rearrange of 32-bit elements of src by the indices in shuffle.
// 128-bit vectors always use vpermilps; wider vectors use vpermd for T_INT and
// vpermps for T_FLOAT (any other bt trips the assert).
void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}

// Emits the scalar half-precision arithmetic instruction that matches opcode
// (Op_AddHF, Op_SubHF, Op_MulHF or Op_DivHF) as dst = src1 <op> src2.
// Any other opcode only trips the assert; nothing is emitted for it.
void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  switch(opcode) {
    case Op_AddHF: vaddsh(dst, src1, src2); break;
    case Op_SubHF: vsubsh(dst, src1, src2); break;
    case Op_MulHF: vmulsh(dst, src1, src2); break;
    case Op_DivHF: vdivsh(dst, src1, src2); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

// Emits a signed saturating add or subtract of byte or short lanes, register form.
// ideal_opc must be Op_SaturatingAddV or Op_SaturatingSubV (the latter is asserted);
// element types other than T_BYTE and T_SHORT end in fatal().
void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// Emits an unsigned saturating add or subtract of byte or short lanes, register form.
// ideal_opc must be Op_SaturatingAddV or Op_SaturatingSubV (the latter is asserted);
// element types other than T_BYTE and T_SHORT end in fatal().
void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// Emits an unsigned saturating subtract of int/long lanes using an EVEX mask
// register: lanes that would underflow produce zero.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                              XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
  // overflow_mask = Inp1 <u Inp2
  // NOTE(review): the compare below yields ktmp = Inp2 <u Inp1, i.e. the lanes that keep
  // the difference rather than the overflow mask itself. The other lanes (including
  // Inp1 == Inp2, whose difference is zero anyway) are presumably zeroed by the masked
  // subtract - confirm the merge/zeroing arguments of evmasked_op.
  evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}

// Emits an unsigned saturating subtract of int/long lanes for AVX targets without
// mask registers: lanes that would underflow produce zero.
// xtmp1 and xtmp2 are overwritten.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                             XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  // xtmp2 = Inp1 + MIN_VALUE, xtmp1 = Inp2 + MIN_VALUE
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  // Mask (xtmp2) = xtmp1 >s xtmp2
  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}

// Emits an unsigned saturating add of int/long lanes using an EVEX mask register:
// lanes that overflow produce the all-ones value.
// xtmp1 and xtmp2 are overwritten.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare:  Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res  = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
}

//
// Section 2-13 Hacker's Delight list following overflow detection check for saturating
// unsigned addition operation.
//    overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to following reduced expression
//    overflow_mask =  (a + b) <u (a | b)
//
// and also verified it through Alive2 solver.
// (https://alive2.llvm.org/ce/z/XDQ7dY)
//

// Emits an unsigned saturating add of int/long lanes for AVX targets without mask
// registers: lanes that overflow produce the all-ones value.
// xtmp1, xtmp2 and xtmp3 are overwritten.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                             XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = minimum signed value; as a side effect xtmp1 = all ones, which is
  // the saturated (maximum unsigned) value blended in at the end.
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  // Res = overflow_mask ? Max_Unsigned : Res
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}

// Sets ktmp from the sign bits of the 64-bit lanes of src.
// With AVX512DQ this is a single evpmovq2m. Otherwise (EVEX asserted) each lane is
// sign-extended into xtmp1 and compared for equality against all ones in xtmp2.
//
//   xtmp2_hold_M1 - when true, xtmp2 is taken to already hold -1 in every lane and is
//                   not regenerated
void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // xtmp1 = sign bit replicated across the lane
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}

// Sets ktmp from the sign bits of the 32-bit lanes of src.
// With AVX512DQ this is a single evpmovd2m. Otherwise (EVEX asserted) each lane is
// sign-extended into xtmp1 and compared for equality against all ones in xtmp2.
//
//   xtmp2_hold_M1 - when true, xtmp2 is taken to already hold -1 in every lane and is
//                   not regenerated
void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // xtmp1 = sign bit replicated across the lane
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}


// Fills every int/long lane of dst with copies of the sign bit of the matching lane
// of src (all ones for negative lanes, zero otherwise).
// elem_bt must be T_LONG or T_INT (asserted).
void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  if (elem_bt == T_LONG) {
    if (VM_Version::supports_evex()) {
      evpsraq(dst, src, 63, vlen_enc);
    } else {
      // No 64-bit arithmetic shift here: sign-extend each 32-bit half, then copy
      // the upper half of every quadword over its lower half (0xF5 selects dwords 1,1,3,3).
      vpsrad(dst, src, 31, vlen_enc);
      vpshufd(dst, dst, 0xF5, vlen_enc);
    }
  } else {
    assert(elem_bt == T_INT, "");
    vpsrad(dst, src, 31, vlen_enc);
  }
}
// Materializes the maximum signed int/long value in every lane of dst by shifting
// an all-ones vector right by one bit.
//
//   allones         - register holding (or, when compute_allones is set, receiving)
//                     all ones
//   compute_allones - when true, allones is generated here first; otherwise the
//                     caller must already have filled it
//                     (default declared in the header - presumably false, TODO confirm)
// elem_bt must be T_LONG or T_INT (asserted).
void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsrlq(dst, allones, 1, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpsrld(dst, allones, 1, vlen_enc);
  }
}

// Materializes the minimum signed int/long value in every lane of dst by shifting
// an all-ones vector left until only the sign bit remains.
//
//   allones         - register holding (or, when compute_allones is set, receiving)
//                     all ones
//   compute_allones - when true, allones is generated here first; otherwise the
//                     caller must already have filled it
//                     (default declared in the header - presumably false, TODO confirm)
// elem_bt must be T_LONG or T_INT (asserted).
void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsllq(dst, allones, 63, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpslld(dst, allones, 31, vlen_enc);
  }
}

// Emits the EVEX unsigned compare (evpcmpuq/ud/uw/ub) whose lane width matches
// elem_bt, writing the result of `src1 cond src2` to kmask.
// Types other than T_LONG, T_INT, T_SHORT and T_BYTE end in fatal().
void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
                                Assembler::ComparisonPredicate cond, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
    case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
    case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
    case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

// Emits the signed greater-than compare (vpcmpgtq/d/w/b) whose lane width matches
// elem_bt, with operands (dst, src1, src2) passed through unchanged.
// Types other than T_LONG, T_INT, T_SHORT and T_BYTE end in fatal().
void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
    case T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
    case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
    case T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

// Sets ktmp from the sign bits of the int/long lanes of src by dispatching to
// evpmovq2m_emu (T_LONG) or evpmovd2m_emu (T_INT, asserted). The temporaries and the
// xtmp2_hold_M1 flag are forwarded unchanged.
void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
  if (elem_bt == T_LONG) {
    evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  } else {
    assert(elem_bt == T_INT, "");
    evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  }
}

// Emits a signed saturating add or subtract of int/long lanes using EVEX mask
// registers. Lanes that overflow are replaced by MIN_VALUE when the first input is
// negative and by MAX_VALUE otherwise.
//
//   ideal_opc    - Op_SaturatingAddV or Op_SaturatingSubV (the latter is asserted)
//   elem_bt      - T_INT or T_LONG (asserted)
//   xtmp1, xtmp2 - vector temporaries, overwritten
//   ktmp1        - overflow mask
//   ktmp2        - sign mask of the first input
void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 hold -1 in all its lanes after above call.

  // Compute mask based on first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  // xtmp2 = MAX_VALUE (all ones regenerated into xtmp1), then xtmp1 = MIN_VALUE.
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in first input polarity mask holds a min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}


// Emits a signed saturating add or subtract of int/long lanes for AVX targets
// without mask registers. Lanes that overflow are replaced by MIN_VALUE when the
// first input is negative and by MAX_VALUE otherwise.
//
//   ideal_opc    - Op_SaturatingAddV or Op_SaturatingSubV (the latter is asserted)
//   elem_bt      - T_INT or T_LONG (asserted)
//   xtmp1..xtmp4 - vector temporaries, overwritten
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  // xtmp1 = all ones, xtmp2 = MAX_VALUE, then xtmp1 = MIN_VALUE.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}

// Emits a signed saturating add or subtract of byte or short lanes with the second
// operand read from memory.
// ideal_opc must be Op_SaturatingAddV or Op_SaturatingSubV (the latter is asserted);
// element types other than T_BYTE and T_SHORT end in fatal().
void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// Emits an unsigned saturating add or subtract of byte or short lanes with the
// second operand read from memory.
// ideal_opc must be Op_SaturatingAddV or Op_SaturatingSubV (the latter is asserted);
// element types other than T_BYTE and T_SHORT end in fatal().
void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// Emits the two-source permute (evpermi2b/w/d/q/ps/pd) that matches elem_bt, with
// operands (dst, src1, src2) passed through unchanged.
// NOTE(review): for the vpermi2 family dst presumably carries the selection indices
// on entry and the result on exit - confirm against the instruction reference.
// Types other than T_BYTE, T_SHORT, T_INT, T_LONG, T_FLOAT and T_DOUBLE end in fatal().
void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                     XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      evpermi2b(dst, src1, src2, vlen_enc);
      break;
    case T_SHORT:
      evpermi2w(dst, src1, src2, vlen_enc);
      break;
    case T_INT:
      evpermi2d(dst, src1, src2, vlen_enc);
      break;
    case T_LONG:
      evpermi2q(dst, src1, src2, vlen_enc);
      break;
    case T_FLOAT:
      evpermi2ps(dst, src1, src2, vlen_enc);
      break;
    case T_DOUBLE:
      evpermi2pd(dst, src1, src2, vlen_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// Register form: selects between the unsigned and the signed byte/short saturating
// emitter according to is_unsigned and forwards all other arguments.
void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
  if (is_unsigned) {
    vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  } else {
    vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  }
}

// Memory form: selects between the unsigned and the signed byte/short saturating
// emitter according to is_unsigned and forwards all other arguments.
void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
  if (is_unsigned) {
    vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  } else {
    vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  }
}

// Emits the packed half-precision arithmetic instruction that matches opcode
// (Op_AddVHF, Op_SubVHF, Op_MulVHF or Op_DivVHF) as dst = src1 <op> src2, register form.
// Any other opcode only trips the assert; nothing is emitted for it.
void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(opcode) {
    case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
    case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
    case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
    case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

// Emits the packed half-precision arithmetic instruction that matches opcode
// (Op_AddVHF, Op_SubVHF, Op_MulVHF or Op_DivVHF) with the second operand read from memory.
// Any other opcode only trips the assert; nothing is emitted for it.
void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(opcode) {
    case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
    case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
    case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
    case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

// Scalar half-precision max/min: reuses the vector sequence with a 128-bit
// vector length encoding.
void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}

// Emits half-precision max or min of src1 and src2 into dst.
//
// The raw evmaxph/evminph result depends on operand order for signed zeros and NaNs,
// so the inputs are first reordered by sign and the result is patched afterwards
// when the first reordered operand is a NaN.
//
//   opcode       - Op_MaxVHF/Op_MaxHF select max; anything else must be
//                  Op_MinVHF/Op_MinHF (asserted)
//   ktmp         - mask temporary (sign mask, then NaN mask)
//   xtmp1, xtmp2 - reordered operands, overwritten
void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make seconds source operand a +ve value.
    // As per instruction semantic, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if its a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make seconds source operand a -ve value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if its a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}