1 /* 2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 
22 * 23 */ 24 25 #include "asm/assembler.hpp" 26 #include "asm/assembler.inline.hpp" 27 #include "gc/shared/barrierSet.hpp" 28 #include "gc/shared/barrierSetAssembler.hpp" 29 #include "oops/methodData.hpp" 30 #include "opto/c2_MacroAssembler.hpp" 31 #include "opto/intrinsicnode.hpp" 32 #include "opto/output.hpp" 33 #include "opto/opcodes.hpp" 34 #include "opto/subnode.hpp" 35 #include "runtime/globals.hpp" 36 #include "runtime/objectMonitor.hpp" 37 #include "runtime/objectMonitorTable.hpp" 38 #include "runtime/stubRoutines.hpp" 39 #include "runtime/synchronizer.hpp" 40 #include "utilities/checkedCast.hpp" 41 #include "utilities/globalDefinitions.hpp" 42 #include "utilities/powerOfTwo.hpp" 43 #include "utilities/sizes.hpp" 44 45 #ifdef PRODUCT 46 #define BLOCK_COMMENT(str) /* nothing */ 47 #define STOP(error) stop(error) 48 #else 49 #define BLOCK_COMMENT(str) block_comment(str) 50 #define STOP(error) block_comment(error); stop(error) 51 #endif 52 53 // C2 compiled method's prolog code. 54 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) { 55 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 56 57 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 58 // Remove word for return addr 59 framesize -= wordSize; 60 stack_bang_size -= wordSize; 61 62 // Calls to C2R adapters often do not accept exceptional returns. 63 // We require that their callers must bang for them. But be careful, because 64 // some VM calls (such as call site linkage) can use several kilobytes of 65 // stack. But the stack safety zone should account for that. 66 // See bugs 4446381, 4468289, 4497237. 67 if (stack_bang_size > 0) { 68 generate_stack_overflow_check(stack_bang_size); 69 70 // We always push rbp, so that on return to interpreter rbp, will be 71 // restored correctly and we can correct the stack. 
72 push(rbp); 73 // Save caller's stack pointer into RBP if the frame pointer is preserved. 74 if (PreserveFramePointer) { 75 mov(rbp, rsp); 76 } 77 // Remove word for ebp 78 framesize -= wordSize; 79 80 // Create frame 81 if (framesize) { 82 subptr(rsp, framesize); 83 } 84 } else { 85 subptr(rsp, framesize); 86 87 // Save RBP register now. 88 framesize -= wordSize; 89 movptr(Address(rsp, framesize), rbp); 90 // Save caller's stack pointer into RBP if the frame pointer is preserved. 91 if (PreserveFramePointer) { 92 movptr(rbp, rsp); 93 if (framesize > 0) { 94 addptr(rbp, framesize); 95 } 96 } 97 } 98 99 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 100 framesize -= wordSize; 101 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 102 } 103 104 #ifdef ASSERT 105 if (VerifyStackAtCalls) { 106 Label L; 107 push(rax); 108 mov(rax, rsp); 109 andptr(rax, StackAlignmentInBytes-1); 110 cmpptr(rax, StackAlignmentInBytes-wordSize); 111 pop(rax); 112 jcc(Assembler::equal, L); 113 STOP("Stack is not properly aligned!"); 114 bind(L); 115 } 116 #endif 117 118 if (!is_stub) { 119 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 120 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub. 
121 Label dummy_slow_path; 122 Label dummy_continuation; 123 Label* slow_path = &dummy_slow_path; 124 Label* continuation = &dummy_continuation; 125 if (!Compile::current()->output()->in_scratch_emit_size()) { 126 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 127 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 128 Compile::current()->output()->add_stub(stub); 129 slow_path = &stub->entry(); 130 continuation = &stub->continuation(); 131 } 132 bs->nmethod_entry_barrier(this, slow_path, continuation); 133 } 134 } 135 136 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 137 switch (vlen_in_bytes) { 138 case 4: // fall-through 139 case 8: // fall-through 140 case 16: return Assembler::AVX_128bit; 141 case 32: return Assembler::AVX_256bit; 142 case 64: return Assembler::AVX_512bit; 143 144 default: { 145 ShouldNotReachHere(); 146 return Assembler::AVX_NoVec; 147 } 148 } 149 } 150 151 // fast_lock and fast_unlock used by C2 152 153 // Because the transitions from emitted code to the runtime 154 // monitorenter/exit helper stubs are so slow it's critical that 155 // we inline both the lock-stack fast path and the inflated fast path. 156 // 157 // See also: cmpFastLock and cmpFastUnlock. 158 // 159 // What follows is a specialized inline transliteration of the code 160 // in enter() and exit(). If we're concerned about I$ bloat another 161 // option would be to emit TrySlowEnter and TrySlowExit methods 162 // at startup-time. These methods would accept arguments as 163 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure 164 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply 165 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit. 166 // In practice, however, the # of lock sites is bounded and is usually small. 
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//     fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//     fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
198 // 199 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value. 200 // Either one is sufficient to uniquely identify a thread. 201 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead. 202 // 203 // * Intrinsify notify() and notifyAll() for the common cases where the 204 // object is locked by the calling thread but the waitlist is empty. 205 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll(). 206 // 207 // * use jccb and jmpb instead of jcc and jmp to improve code density. 208 // But beware of excessive branch density on AMD Opterons. 209 // 210 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success 211 // or failure of the fast path. If the fast path fails then we pass 212 // control to the slow path, typically in C. In fast_lock and 213 // fast_unlock we often branch to DONE_LABEL, just to find that C2 214 // will emit a conditional branch immediately after the node. 215 // So we have branches to branches and lots of ICC.ZF games. 216 // Instead, it might be better to have C2 pass a "FailureLabel" 217 // into fast_lock and fast_unlock. In the case of success, control 218 // will drop through the node. ICC.ZF is undefined at exit. 219 // In the case of failure, the node will branch directly to the 220 // FailureLabel 221 222 // obj: object to lock 223 // box: on-stack box address -- KILLED 224 // rax: tmp -- KILLED 225 // t : tmp -- KILLED 226 void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg, 227 Register t, Register thread) { 228 assert(rax_reg == rax, "Used for CAS"); 229 assert_different_registers(obj, box, rax_reg, t, thread); 230 231 // Handle inflated monitor. 232 Label inflated; 233 // Finish fast lock successfully. ZF value is irrelevant. 234 Label locked; 235 // Finish fast lock unsuccessfully. 
MUST jump with ZF == 0 236 Label slow_path; 237 238 if (UseObjectMonitorTable) { 239 // Clear cache in case fast locking succeeds or we need to take the slow-path. 240 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0); 241 } 242 243 if (DiagnoseSyncOnValueBasedClasses != 0) { 244 load_klass(rax_reg, obj, t); 245 testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 246 jcc(Assembler::notZero, slow_path); 247 } 248 249 const Register mark = t; 250 251 { // Fast Lock 252 253 Label push; 254 255 const Register top = UseObjectMonitorTable ? rax_reg : box; 256 257 // Load the mark. 258 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 259 260 // Prefetch top. 261 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 262 263 // Check for monitor (0b10). 264 testptr(mark, markWord::monitor_value); 265 jcc(Assembler::notZero, inflated); 266 267 // Check if lock-stack is full. 268 cmpl(top, LockStack::end_offset() - 1); 269 jcc(Assembler::greater, slow_path); 270 271 // Check if recursive. 272 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 273 jccb(Assembler::equal, push); 274 275 // Try to lock. Transition lock bits 0b01 => 0b00 276 movptr(rax_reg, mark); 277 orptr(rax_reg, markWord::unlocked_value); 278 andptr(mark, ~(int32_t)markWord::unlocked_value); 279 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 280 jcc(Assembler::notEqual, slow_path); 281 282 if (UseObjectMonitorTable) { 283 // Need to reload top, clobbered by CAS. 284 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 285 } 286 bind(push); 287 // After successful lock, push object on lock-stack. 288 movptr(Address(thread, top), obj); 289 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 290 jmp(locked); 291 } 292 293 { // Handle inflated monitor. 
294 bind(inflated); 295 296 const Register monitor = t; 297 298 if (!UseObjectMonitorTable) { 299 assert(mark == monitor, "should be the same here"); 300 } else { 301 const Register hash = t; 302 Label monitor_found; 303 304 // Look for the monitor in the om_cache. 305 306 ByteSize cache_offset = JavaThread::om_cache_oops_offset(); 307 ByteSize monitor_offset = OMCache::oop_to_monitor_difference(); 308 const int num_unrolled = OMCache::CAPACITY; 309 for (int i = 0; i < num_unrolled; i++) { 310 movptr(monitor, Address(thread, cache_offset + monitor_offset)); 311 cmpptr(obj, Address(thread, cache_offset)); 312 jccb(Assembler::equal, monitor_found); 313 cache_offset = cache_offset + OMCache::oop_to_oop_difference(); 314 } 315 316 // Look for the monitor in the table. 317 318 // Get the hash code. 319 movptr(hash, Address(obj, oopDesc::mark_offset_in_bytes())); 320 shrq(hash, markWord::hash_shift); 321 andq(hash, markWord::hash_mask); 322 323 // Get the table and calculate the bucket's address. 324 lea(rax_reg, ExternalAddress(ObjectMonitorTable::current_table_address())); 325 movptr(rax_reg, Address(rax_reg)); 326 andq(hash, Address(rax_reg, ObjectMonitorTable::table_capacity_mask_offset())); 327 movptr(rax_reg, Address(rax_reg, ObjectMonitorTable::table_buckets_offset())); 328 329 // Read the monitor from the bucket. 330 movptr(monitor, Address(rax_reg, hash, Address::times_ptr)); 331 332 // Check if the monitor in the bucket is special (empty, tombstone or removed) 333 cmpptr(monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special); 334 jcc(Assembler::below, slow_path); 335 336 // Check if object matches. 
337 movptr(rax_reg, Address(monitor, ObjectMonitor::object_offset())); 338 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 339 bs_asm->try_peek_weak_handle_in_nmethod(this, rax_reg, rax_reg, slow_path); 340 cmpptr(rax_reg, obj); 341 jcc(Assembler::notEqual, slow_path); 342 343 bind(monitor_found); 344 } 345 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 346 const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag); 347 const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag); 348 349 Label monitor_locked; 350 // Lock the monitor. 351 352 if (UseObjectMonitorTable) { 353 // Cache the monitor for unlock before trashing box. On failure to acquire 354 // the lock, the slow path will reset the entry accordingly (see CacheSetter). 355 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor); 356 } 357 358 // Try to CAS owner (no owner => current thread's _monitor_owner_id). 359 xorptr(rax_reg, rax_reg); 360 movptr(box, Address(thread, JavaThread::monitor_owner_id_offset())); 361 lock(); cmpxchgptr(box, owner_address); 362 jccb(Assembler::equal, monitor_locked); 363 364 // Check if recursive. 365 cmpptr(box, rax_reg); 366 jccb(Assembler::notEqual, slow_path); 367 368 // Recursive. 369 increment(recursions_address); 370 371 bind(monitor_locked); 372 } 373 374 bind(locked); 375 // Set ZF = 1 376 xorl(rax_reg, rax_reg); 377 378 #ifdef ASSERT 379 // Check that locked label is reached with ZF set. 380 Label zf_correct; 381 Label zf_bad_zero; 382 jcc(Assembler::zero, zf_correct); 383 jmp(zf_bad_zero); 384 #endif 385 386 bind(slow_path); 387 #ifdef ASSERT 388 // Check that slow_path label is reached with ZF not set. 
389 jcc(Assembler::notZero, zf_correct); 390 stop("Fast Lock ZF != 0"); 391 bind(zf_bad_zero); 392 stop("Fast Lock ZF != 1"); 393 bind(zf_correct); 394 #endif 395 // C2 uses the value of ZF to determine the continuation. 396 } 397 398 // obj: object to lock 399 // rax: tmp -- KILLED 400 // t : tmp - cannot be obj nor rax -- KILLED 401 // 402 // Some commentary on balanced locking: 403 // 404 // fast_lock and fast_unlock are emitted only for provably balanced lock sites. 405 // Methods that don't have provably balanced locking are forced to run in the 406 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock. 407 // The interpreter provides two properties: 408 // I1: At return-time the interpreter automatically and quietly unlocks any 409 // objects acquired in the current activation (frame). Recall that the 410 // interpreter maintains an on-stack list of locks currently held by 411 // a frame. 412 // I2: If a method attempts to unlock an object that is not held by the 413 // frame the interpreter throws IMSX. 414 // 415 // Lets say A(), which has provably balanced locking, acquires O and then calls B(). 416 // B() doesn't have provably balanced locking so it runs in the interpreter. 417 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O 418 // is still locked by A(). 419 // 420 // The only other source of unbalanced locking would be JNI. The "Java Native Interface 421 // Specification" states that an object locked by JNI's MonitorEnter should not be 422 // unlocked by "normal" java-level locking and vice-versa. The specification doesn't 423 // specify what will occur if a program engages in such mixed-mode locking, however. 424 // Arguably given that the spec legislates the JNI case as undefined our implementation 425 // could reasonably *avoid* checking owner in fast_unlock(). 426 // In the interest of performance we elide m->Owner==Self check in unlock. 
427 // A perfectly viable alternative is to elide the owner check except when 428 // Xcheck:jni is enabled. 429 430 void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) { 431 assert(reg_rax == rax, "Used for CAS"); 432 assert_different_registers(obj, reg_rax, t); 433 434 // Handle inflated monitor. 435 Label inflated, inflated_check_lock_stack; 436 // Finish fast unlock successfully. MUST jump with ZF == 1 437 Label unlocked, slow_path; 438 439 const Register mark = t; 440 const Register monitor = t; 441 const Register top = UseObjectMonitorTable ? t : reg_rax; 442 const Register box = reg_rax; 443 444 Label dummy; 445 C2FastUnlockStub* stub = nullptr; 446 447 if (!Compile::current()->output()->in_scratch_emit_size()) { 448 stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread); 449 Compile::current()->output()->add_stub(stub); 450 } 451 452 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path(); 453 454 { // Fast Unlock 455 456 // Load top. 457 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 458 459 if (!UseObjectMonitorTable) { 460 // Prefetch mark. 461 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 462 } 463 464 // Check if obj is top of lock-stack. 465 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 466 // Top of lock stack was not obj. Must be monitor. 467 jcc(Assembler::notEqual, inflated_check_lock_stack); 468 469 // Pop lock-stack. 470 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);) 471 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 472 473 // Check if recursive. 474 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize)); 475 jcc(Assembler::equal, unlocked); 476 477 // We elide the monitor check, let the CAS fail instead. 478 479 if (UseObjectMonitorTable) { 480 // Load mark. 
481 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 482 } 483 484 // Try to unlock. Transition lock bits 0b00 => 0b01 485 movptr(reg_rax, mark); 486 andptr(reg_rax, ~(int32_t)markWord::lock_mask_in_place); 487 orptr(mark, markWord::unlocked_value); 488 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 489 jcc(Assembler::notEqual, push_and_slow_path); 490 jmp(unlocked); 491 } 492 493 494 { // Handle inflated monitor. 495 bind(inflated_check_lock_stack); 496 #ifdef ASSERT 497 Label check_done; 498 subl(top, oopSize); 499 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset())); 500 jcc(Assembler::below, check_done); 501 cmpptr(obj, Address(thread, top)); 502 jcc(Assembler::notEqual, inflated_check_lock_stack); 503 stop("Fast Unlock lock on stack"); 504 bind(check_done); 505 if (UseObjectMonitorTable) { 506 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 507 } 508 testptr(mark, markWord::monitor_value); 509 jcc(Assembler::notZero, inflated); 510 stop("Fast Unlock not monitor"); 511 #endif 512 513 bind(inflated); 514 515 if (!UseObjectMonitorTable) { 516 assert(mark == monitor, "should be the same here"); 517 } else { 518 // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack. 519 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 520 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*) 521 cmpptr(monitor, alignof(ObjectMonitor*)); 522 jcc(Assembler::below, slow_path); 523 } 524 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 
0 : checked_cast<int>(markWord::monitor_value)); 525 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag}; 526 const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag}; 527 const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag}; 528 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag}; 529 530 Label recursive; 531 532 // Check if recursive. 533 cmpptr(recursions_address, 0); 534 jcc(Assembler::notZero, recursive); 535 536 // Set owner to null. 537 // Release to satisfy the JMM 538 movptr(owner_address, NULL_WORD); 539 // We need a full fence after clearing owner to avoid stranding. 540 // StoreLoad achieves this. 541 membar(StoreLoad); 542 543 // Check if the entry_list is empty. 544 cmpptr(entry_list_address, NULL_WORD); 545 jcc(Assembler::zero, unlocked); // If so we are done. 546 547 // Check if there is a successor. 548 cmpptr(succ_address, NULL_WORD); 549 jcc(Assembler::notZero, unlocked); // If so we are done. 550 551 // Save the monitor pointer in the current thread, so we can try to 552 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 553 if (!UseObjectMonitorTable) { 554 andptr(monitor, ~(int32_t)markWord::monitor_value); 555 } 556 movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor); 557 558 orl(t, 1); // Fast Unlock ZF = 0 559 jmpb(slow_path); 560 561 // Recursive unlock. 562 bind(recursive); 563 decrement(recursions_address); 564 } 565 566 bind(unlocked); 567 xorl(t, t); // Fast Unlock ZF = 1 568 569 #ifdef ASSERT 570 // Check that unlocked label is reached with ZF set. 571 Label zf_correct; 572 Label zf_bad_zero; 573 jcc(Assembler::zero, zf_correct); 574 jmp(zf_bad_zero); 575 #endif 576 577 bind(slow_path); 578 if (stub != nullptr) { 579 bind(stub->slow_path_continuation()); 580 } 581 #ifdef ASSERT 582 // Check that stub->continuation() label is reached with ZF not set. 
583 jcc(Assembler::notZero, zf_correct); 584 stop("Fast Unlock ZF != 0"); 585 bind(zf_bad_zero); 586 stop("Fast Unlock ZF != 1"); 587 bind(zf_correct); 588 #endif 589 // C2 uses the value of ZF to determine the continuation. 590 } 591 592 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) { 593 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi); 594 } 595 596 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) { 597 const int framesize = Compile::current()->output()->frame_size_in_bytes(); 598 masm->movptr(dst, rsp); 599 if (framesize > 2 * wordSize) { 600 masm->addptr(dst, framesize - 2 * wordSize); 601 } 602 } 603 604 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) { 605 if (PreserveFramePointer) { 606 // frame pointer is valid 607 #ifdef ASSERT 608 // Verify frame pointer value in rbp. 609 reconstruct_frame_pointer_helper(this, rtmp); 610 Label L_success; 611 cmpq(rbp, rtmp); 612 jccb(Assembler::equal, L_success); 613 STOP("frame pointer mismatch"); 614 bind(L_success); 615 #endif // ASSERT 616 } else { 617 reconstruct_frame_pointer_helper(this, rbp); 618 } 619 } 620 621 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) { 622 jint lo = t->_lo; 623 jint hi = t->_hi; 624 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi); 625 if (t == TypeInt::INT) { 626 return; 627 } 628 629 BLOCK_COMMENT("CastII {"); 630 Label fail; 631 Label succeed; 632 633 if (lo != min_jint) { 634 cmpl(val, lo); 635 jccb(Assembler::less, fail); 636 } 637 if (hi != max_jint) { 638 cmpl(val, hi); 639 jccb(Assembler::greater, fail); 640 } 641 jmpb(succeed); 642 643 bind(fail); 644 movl(c_rarg0, idx); 645 movl(c_rarg1, val); 646 movl(c_rarg2, lo); 647 movl(c_rarg3, hi); 648 reconstruct_frame_pointer(rscratch1); 649 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range))); 650 hlt(); 651 bind(succeed); 652 
BLOCK_COMMENT("} // CastII"); 653 } 654 655 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) { 656 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi); 657 } 658 659 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) { 660 jlong lo = t->_lo; 661 jlong hi = t->_hi; 662 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi); 663 if (t == TypeLong::LONG) { 664 return; 665 } 666 667 BLOCK_COMMENT("CastLL {"); 668 Label fail; 669 Label succeed; 670 671 auto cmp_val = [&](jlong bound) { 672 if (is_simm32(bound)) { 673 cmpq(val, checked_cast<int>(bound)); 674 } else { 675 mov64(tmp, bound); 676 cmpq(val, tmp); 677 } 678 }; 679 680 if (lo != min_jlong) { 681 cmp_val(lo); 682 jccb(Assembler::less, fail); 683 } 684 if (hi != max_jlong) { 685 cmp_val(hi); 686 jccb(Assembler::greater, fail); 687 } 688 jmpb(succeed); 689 690 bind(fail); 691 movl(c_rarg0, idx); 692 movq(c_rarg1, val); 693 mov64(c_rarg2, lo); 694 mov64(c_rarg3, hi); 695 reconstruct_frame_pointer(rscratch1); 696 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range))); 697 hlt(); 698 bind(succeed); 699 BLOCK_COMMENT("} // CastLL"); 700 } 701 702 //------------------------------------------------------------------------------------------- 703 // Generic instructions support for use in .ad files C2 code generation 704 705 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 706 if (dst != src) { 707 movdqu(dst, src); 708 } 709 if (opcode == Op_AbsVD) { 710 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 711 } else { 712 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 713 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 714 } 715 } 716 717 void C2_MacroAssembler::vabsnegd(int opcode, 
XMMRegister dst, XMMRegister src, int vector_len) { 718 if (opcode == Op_AbsVD) { 719 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 720 } else { 721 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 722 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 723 } 724 } 725 726 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 727 if (dst != src) { 728 movdqu(dst, src); 729 } 730 if (opcode == Op_AbsVF) { 731 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 732 } else { 733 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 734 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 735 } 736 } 737 738 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 739 if (opcode == Op_AbsVF) { 740 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 741 } else { 742 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 743 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 744 } 745 } 746 747 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 748 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 749 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 750 751 if (opcode == Op_MinV) { 752 if (elem_bt == T_BYTE) { 753 pminsb(dst, src); 754 } else if (elem_bt == T_SHORT) { 755 pminsw(dst, src); 756 } else if (elem_bt == T_INT) { 757 pminsd(dst, src); 758 } else { 759 assert(elem_bt == T_LONG, "required"); 760 assert(tmp == xmm0, "required"); 761 assert_different_registers(dst, src, tmp); 762 movdqu(xmm0, dst); 763 pcmpgtq(xmm0, src); 764 blendvpd(dst, src); // xmm0 as mask 765 } 766 } else { // opcode == Op_MaxV 767 if (elem_bt == T_BYTE) { 768 pmaxsb(dst, src); 769 } 
else if (elem_bt == T_SHORT) { 770 pmaxsw(dst, src); 771 } else if (elem_bt == T_INT) { 772 pmaxsd(dst, src); 773 } else { 774 assert(elem_bt == T_LONG, "required"); 775 assert(tmp == xmm0, "required"); 776 assert_different_registers(dst, src, tmp); 777 movdqu(xmm0, src); 778 pcmpgtq(xmm0, dst); 779 blendvpd(dst, src); // xmm0 as mask 780 } 781 } 782 } 783 784 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 785 XMMRegister src1, Address src2, int vlen_enc) { 786 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 787 if (opcode == Op_UMinV) { 788 switch(elem_bt) { 789 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 790 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 791 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 792 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 793 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 794 } 795 } else { 796 assert(opcode == Op_UMaxV, "required"); 797 switch(elem_bt) { 798 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 799 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 800 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 801 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 802 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 803 } 804 } 805 } 806 807 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 808 // For optimality, leverage a full vector width of 512 bits 809 // for operations over smaller vector sizes on AVX512 targets. 
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}

// Emits an element-wise unsigned minimum (Op_UMinV) or maximum (Op_UMaxV) of
// src1 and src2 into dst, choosing the instruction from the element type.
// T_LONG is routed to evpminuq/evpmaxuq, called here with k0 as the mask
// register argument. Any other element type is a fatal error.
void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

// Emits an element-wise signed minimum (Op_MinV) or maximum (Op_MaxV) of src1
// and src2 into dst for T_BYTE, T_SHORT, T_INT and T_LONG elements.
// For T_LONG, vpminsq/vpmaxsq are used only when UseAVX > 2 and either the
// vector is 512 bits wide or AVX512VL is available; otherwise the result is
// built from a signed compare (vpcmpgtq) followed by a blend (vblendvpd).
void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        // dst first receives the compare result and then serves as the blend
        // mask while src1 and src2 are still being read, so it must not
        // alias either source.
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        // Same compare-then-blend scheme as the min case, with the blend
        // sources swapped.
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

// Emits a floating point min/max of a and b into dst using vector blends
// (no opmask register); see the note in the body for how -0.0/+0.0 and NaN
// are handled.
//   opcode          - Op_MinV/Op_MinReductionV or Op_MaxV/Op_MaxReductionV
//   elem_bt         - T_FLOAT or T_DOUBLE
//   tmp, atmp, btmp - temporaries; none of them may alias a or b (asserted)
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  // Instruction selectors, bound below according to element width and min/max.
  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  // Register that drives the two initial blends: a for min, b for max.
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  // With EnableX86ECoreOpts on AVX2+, the blend mask is materialized in tmp
  // up front and the blends below are told not to compute it themselves.
  // NOTE(review): the bool and trailing register passed to vblend are
  // presumably "compute mask" and its scratch register - confirm against
  // MacroAssembler::vblendvps/vblendvpd.
  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    // tmp = (0 > mask), using a signed quadword compare
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  // scratch = lanes where atmp is unordered with itself, i.e. atmp is NaN
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  // Final result: atmp where it is NaN, the hardware min/max elsewhere
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

// AVX-512 flavour of vminmax_fp: the same blend / min-max / NaN fix-up
// sequence, but driven by the opmask register ktmp instead of a vector mask.
//   opcode     - Op_MinV/Op_MinReductionV or Op_MaxV/Op_MaxReductionV
//   elem_bt    - T_FLOAT or T_DOUBLE
//   atmp, btmp - temporaries; dst, atmp and btmp may not alias a or b, nor
//                each other (asserted)
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  // Each branch: derive ktmp from a (min) or b (max), blend a/b into
  // atmp/btmp under ktmp, take the hardware min/max, then recompute ktmp as
  // "atmp is unordered with itself" and merge atmp into dst under that mask.
  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Vector float/double min/max as a single evminmaxps/evminmaxpd instruction.
// The immediate is AVX10_2_MINMAX_MIN_COMPARE_SIGN for the min opcodes and
// AVX10_2_MINMAX_MAX_COMPARE_SIGN for the max opcodes.
void C2_MacroAssembler::vminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
                                           XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opc == Op_MinV || opc == Op_MinReductionV ||
         opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");

  int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
                                                           : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
  if (elem_bt == T_FLOAT) {
    evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
  } else {
    assert(elem_bt == T_DOUBLE, "");
    evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
  }
}

// Scalar counterpart of vminmax_fp_avx10_2 for Op_MinF/Op_MaxF/Op_MinD/Op_MaxD,
// emitting evminmaxss or evminmaxsd with the same immediate selection.
void C2_MacroAssembler::sminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
                                           XMMRegister src1, XMMRegister src2) {
  assert(opc == Op_MinF || opc == Op_MaxF ||
         opc == Op_MinD || opc == Op_MaxD, "sanity");

  int imm8 = (opc == Op_MinF || opc == Op_MinD) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
                                                : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
  if (elem_bt == T_FLOAT) {
    evminmaxss(dst, mask, src1, src2, true, imm8);
  } else {
    assert(elem_bt == T_DOUBLE, "");
    evminmaxsd(dst, mask, src1, src2, true, imm8);
  }
}

// Float/Double signum
// dst holds the argument on entry and the result on exit. zero and one are
// registers supplied by the caller (presumably holding 0.0 and 1.0 of the
// matching width - confirm at the call sites).
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument
  // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases
  // If other floating point comparison instructions used, ZF=1 for equal and unordered cases
  if (opcode == Op_SignumF) {
    if (VM_Version::supports_avx10_2()) {
      evucomxss(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomiss(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // The branch below still consumes the flags of the compare above, so the
    // move in between is relied upon to leave them untouched.
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    // Argument was below zero: flip the sign of the value just loaded from one.
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    if (VM_Version::supports_avx10_2()) {
      evucomxsd(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomisd(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Same flag dependency as in the float case.
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

// Byte-to-word extension (SSE form): sign-extending when sign is true,
// zero-extending otherwise.
void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

// Byte-to-word extension (VEX form with explicit vector length).
void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

// Byte-to-doubleword extension, signed or unsigned according to sign.
void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

// Word-to-doubleword extension, signed or unsigned according to sign.
void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

// Vector rotate by an immediate count. opcode is Op_RotateLeftV or
// Op_RotateRightV; etype is T_INT or T_LONG.
void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

// Vector rotate where the counts come from the vector register shift.
// opcode is Op_RotateLeftV or Op_RotateRightV; etype is T_INT or T_LONG.
void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

// In-place doubleword shift of dst by an immediate (SSE form).
void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

// In-place doubleword shift of dst by the count held in shift (SSE form).
// An unexpected opcode only trips the assert; nothing is emitted for it.
void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Doubleword shift of nds by an immediate into dst (VEX form).
void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

// Doubleword shift of src by the count held in shift into dst (VEX form).
void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// In-place word shift of dst by the count held in shift (SSE form). The byte
// opcodes share the word instructions with the short opcodes.
void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Word shift of src by the count held in shift into dst (VEX form). The byte
// opcodes share the word instructions with the short opcodes.
void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// In-place quadword shift of dst by the count held in shift (SSE form).
// Note that Op_RShiftVL emits the logical shift here; see the inline comment.
void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// In-place quadword shift of dst by an immediate (SSE form).
void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

// Quadword shift of src by the count held in shift into dst. The arithmetic
// right shift uses the EVEX-only evpsraq.
void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Quadword shift of nds by an immediate into dst. The arithmetic right shift
// uses the EVEX-only evpsraq.
void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

// Per-element variable shift on doubleword lanes. Byte and short opcodes are
// accepted as well and map to the same doubleword instructions.
void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Per-element variable shift on word lanes (EVEX instructions). Byte opcodes
// map to the same word instructions.
void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Per-element variable shift on quadword lanes.
// tmp is used only for Op_RShiftVL when UseAVX == 2; on every other path it
// must be xnoreg (asserted).
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        // Without AVX512VL only the 512-bit encoding is used.
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        // Emulate the arithmetic shift with logical shifts: with
        // m = sign_mask >>> shift, the result is ((src >>> shift) ^ m) - m.
        // (vector_long_sign_mask presumably has only the top bit of each
        // quadword set - confirm in the stub generator.)
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Variable shift src by shift using vtmp as TEMP giving word result in dst
// Only vector_len == 0 is supported (asserted). Bytes are widened to
// doublewords, shifted, masked with vector_int_to_byte_mask and packed back
// down to words.
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  // Only the unsigned right shift zero-extends its input.
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

//
// Variable shift src by shift using vtmp as TEMP giving byte result in dst
// Bytes are widened to words at the next larger vector length
// (vector_len + 1), shifted, masked with vector_short_to_byte_mask and packed
// back down to bytes.
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  // Only the unsigned right shift zero-extends its input.
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    // Reorder the quadwords after the pack (selector 0xD8 = 0,2,1,3).
    vpermq(dst, dst, 0xD8, vector_len);
  }
}

// Inserts the general purpose register val into element idx of dst, using the
// SSE insert instruction that matches the element type (T_BYTE, T_SHORT,
// T_INT or T_LONG).
void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst, val, idx);
      break;
    case T_SHORT:
      pinsrw(dst, val, idx);
      break;
    case T_INT:
      pinsrd(dst, val, idx);
      break;
    case T_LONG:
      pinsrq(dst, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// VEX three-operand form of insert(): dst receives src with element idx
// replaced by val.
void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      vpinsrb(dst, src, val, idx);
      break;
    case T_SHORT:
      vpinsrw(dst, src, val, idx);
      break;
    case T_INT:
      vpinsrd(dst, src, val, idx);
      break;
    case T_LONG:
      vpinsrq(dst, src, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// Masked gather of one 64-bit slice (4 shorts or 8 bytes) into the low part
// of dst, which is zeroed first so that masked-off elements stay 0.
//   base     - base address of the source array
//   idx_base - address of the 32-bit gather indices for this slice
//   mask     - bit mask register; bit number mask_idx is tested per element
//   mask_idx - current bit position; incremented once per element, so it is
//              advanced by 4 or 8 on return
//   rtmp     - scratch register receiving each index
// The C++ loops run at code generation time, i.e. the emitted sequence is
// fully unrolled.
void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
                                         Register base, Register idx_base,
                                         Register mask, Register mask_idx,
                                         Register rtmp, int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
      bind(skip_load);
      incq(mask_idx);
    }
  }
}

// Unmasked gather of one 64-bit slice (4 shorts or 8 bytes) into the low part
// of dst. idx_base addresses the 32-bit gather indices for this slice and
// rtmp is a scratch register receiving each index. As in the masked variant,
// the emitted sequence is fully unrolled.
void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
                                  Register base, Register idx_base,
                                  Register rtmp, int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = src[idx_base[i]]
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = src[idx_base[i]]
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
    }
  }
}

/*
 * Gather using hybrid algorithm, first partially unroll scalar loop
 * to accumulate values from gather indices into a quad-word(64bit) slice.
 * A slice may hold 8 bytes or 4 short values. This is followed by a vector
 * permutation to place the slice into appropriate vector lane
 * locations in destination vector.
 * Following pseudo code describes the
 * algorithm in detail:
 *
 * DST_VEC = ZERO_VEC
 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
 * FOREACH_ITER:
 *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
 *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
 *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
 *     PERM_INDEX = PERM_INDEX - TWO_VEC
 *
 * With each iteration, doubleword permute indices (0,1) corresponding
 * to gathered quadword gets right shifted by two lane positions.
 *
 * Register usage in the implementation below: xtmp1 holds PERM_INDEX,
 * xtmp2 holds TWO_VEC, temp_dst holds the per-iteration slice, length is
 * the remaining element count, and idx_base is advanced past the indices
 * consumed by each iteration. mask == noreg selects the unmasked gather.
 * The loop ends only when length reaches exactly zero, so vector_len is
 * assumed to be a multiple of the elements gathered per iteration
 * (8 bytes or 4 shorts).
 */
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register mask, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister temp_dst,
                                        Register rtmp, Register mask_idx,
                                        Register length, int vector_len, int vlen_enc) {
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  movl(length, vector_len);
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
  // 0 - (-1) = 1 in every doubleword, then shifted left by one to give 2
  vallones(xtmp2, vlen_enc);
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
  vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}

  bind(GATHER8_LOOP);
  // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
  if (mask == noreg) {
    vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
  } else {
    vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
  }
  // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
  vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
  // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
  vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
  // DST_VEC = DST_VEC OR TEMP_PERM_VEC
  vpor(dst, dst, temp_dst, vlen_enc);
  // Skip the 32-bit indices just consumed: 32 bytes (8 indices) for 1-byte
  // elements, 16 bytes (4 indices) for 2-byte elements.
  addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1));
  // Count down by the same number of elements; the flags of this subtraction
  // drive the loop branch.
  subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
  jcc(Assembler::notEqual, GATHER8_LOOP);
}

// AVX2 gather with 32-bit indices in idx and a vector mask register.
// The address scale is 4 for T_INT/T_FLOAT and 8 for T_LONG/T_DOUBLE.
void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
  switch(typ) {
    case T_INT:
      vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_FLOAT:
      vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_LONG:
      vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    case T_DOUBLE:
      vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// AVX-512 gather with 32-bit indices in idx and an opmask register.
// The address scale is 4 for T_INT/T_FLOAT and 8 for T_LONG/T_DOUBLE.
void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
  switch(typ) {
    case T_INT:
      evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_FLOAT:
      evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_LONG:
      evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    case T_DOUBLE:
      evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// AVX-512 scatter of src with 32-bit indices in idx and an opmask register.
// The address scale is 4 for T_INT/T_FLOAT and 8 for T_LONG/T_DOUBLE.
void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
  switch(typ) {
    case T_INT:
      evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_FLOAT:
      evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_LONG:
      evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    case T_DOUBLE:
      evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// Builds a vector mask in dst from the byte vector src: every byte is negated
// (0 - src) and the result is sign-extended to the width of elem_bt.
// dst is zeroed before src is read, so the two registers must not alias.
// is_legacy limits the byte subtraction to the 256-bit encoding.
void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
  if (vlen_in_bytes <= 16) {
    pxor (dst, dst);
    psubb(dst, src);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);

    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}

// Builds an opmask in dst from the byte vector src, using xtmp as a
// temporary. With novlbwdq the bytes are sign-extended to doublewords and
// compared for equality against the vector_int_mask_cmp_bits constant;
// otherwise the bytes are negated and converted with evpmovb2m.
void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
  if (novlbwdq) {
    vpmovsxbd(xtmp, src, vlen_enc);
    evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
            Assembler::eq, true, vlen_enc, noreg);
  } else {
    vpxor(xtmp, xtmp, xtmp, vlen_enc);
    vpsubb(xtmp, xtmp, src, vlen_enc);
    evpmovb2m(dst, xtmp, vlen_enc);
  }
}

// Loads vlen_in_bytes (4, 8, 16, 32 or 64) bytes from src into dst, using
// integer moves for integral element types and floating point moves
// otherwise. Any other length hits ShouldNotReachHere().
void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
  if (is_integral_type(bt)) {
    switch (vlen_in_bytes) {
      case 4:  movdl(dst, src);   break;
      case 8:  movq(dst, src);    break;
      case 16: movdqu(dst, src);  break;
      case 32: vmovdqu(dst, src); break;
      case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
      default: ShouldNotReachHere();
    }
  } else {
    switch (vlen_in_bytes) {
      case 4:  movflt(dst, src); break;
      case 8:  movdbl(dst, src); break;
      case 16: movups(dst, src); break;
      case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
      case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
      default: ShouldNotReachHere();
    }
  }
}

// AddressLiteral overload of load_vector: loads directly when src is
// reachable, otherwise materializes the address in rscratch first. rscratch
// may be noreg only if src is always reachable (asserted).
void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    load_vector(bt, dst, as_Address(src), vlen_in_bytes);
  } else {
    lea(rscratch, src);
    load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
  }
}

// Fills dst with a constant read from src. With AVX a broadcast form matching
// the element type is used; with only SSE3 a movddup is emitted; otherwise
// the constant is loaded with load_vector using vlen as the byte length.
void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
  int vlen_enc = vector_length_encoding(vlen);
  if (VM_Version::supports_avx()) {
    if (bt == T_LONG) {
      if (VM_Version::supports_avx2()) {
        vpbroadcastq(dst, src, vlen_enc);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else if (bt == T_DOUBLE) {
      if (vlen_enc != Assembler::AVX_128bit) {
        vbroadcastsd(dst, src, vlen_enc, noreg);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else {
      if (VM_Version::supports_avx2() && is_integral_type(bt)) {
        vpbroadcastd(dst, src, vlen_enc);
      } else {
        vbroadcastss(dst, src, vlen_enc);
      }
    }
  } else if (VM_Version::supports_sse3()) {
    movddup(dst, src);
  } else {
    load_vector(bt, dst, src, vlen);
  }
}

// Loads vlen_in_bytes bytes of the iota index table selected by bt into dst.
// The load itself is always issued as a T_BYTE (integral) vector load.
void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
  int entry_idx = vector_iota_entry_index(bt);
  ExternalAddress addr(StubRoutines::x86::vector_iota_indices(entry_idx));
  load_vector(T_BYTE, dst, addr, vlen_in_bytes);
}

// Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.

// Emits one 128-bit reduction step dst = dst <op> src for the given reduction
// opcode and element type.
// Most cases use the two-operand SSE form. The T_LONG min/max cases and
// Op_MulReductionVL use AVX-512 instructions and assert UseAVX > 2; the
// unsigned min/max cases use VEX/EVEX three-operand forms with dst as the
// first source. An unexpected opcode or type only trips an assert.
void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        pminsb(dst, src); break;
        case T_SHORT:       pminsw(dst, src); break;
        case T_INT:         pminsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        pmaxsb(dst, src); break;
        case T_SHORT:       pmaxsw(dst, src); break;
        case T_INT:         pmaxsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMinReductionV:
      switch (typ) {
        case T_BYTE:        vpminub(dst, dst, src, Assembler::AVX_128bit); break;
        case T_SHORT:       vpminuw(dst, dst, src, Assembler::AVX_128bit); break;
        case T_INT:         vpminud(dst, dst, src, Assembler::AVX_128bit); break;
        case T_LONG:        evpminuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxub(dst, dst, src, Assembler::AVX_128bit); break;
        case T_SHORT:       vpmaxuw(dst, dst, src, Assembler::AVX_128bit); break;
        case T_INT:         vpmaxud(dst, dst, src, Assembler::AVX_128bit); break;
        case T_LONG:        evpmaxuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    // The ordered float/double reductions operate on the scalar (low) element.
    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        paddb(dst, src); break;
        case T_SHORT:       paddw(dst, src); break;
        case T_INT:         paddd(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       pmullw(dst, src); break;
        case T_INT:         pmulld(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: assert(UseAVX > 2, "required");
                            evpmullq(dst, dst, src, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}

// Emits one 128-bit step of an unordered float/double add or multiply
// reduction, using the packed instructions. typ is not consulted; the opcode
// alone selects the instruction.
void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  switch (opcode) {
    case Op_AddReductionVF: addps(dst, src); break;
    case Op_AddReductionVD: addpd(dst, src); break;
    case Op_MulReductionVF: mulps(dst, src); break;
    case Op_MulReductionVD: mulpd(dst, src); break;
    default:                assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Emits one 256-bit reduction step dst = src1 <op> src2 for the given integer
// reduction opcode and element type (three-operand VEX/EVEX forms).
// The T_LONG signed min/max cases assert UseAVX > 2. No floating point
// opcodes are handled here. An unexpected opcode or type only trips an
// assert.
void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMinReductionV:
      switch (typ) {
        case T_BYTE:        vpminub(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminuw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminud(dst, src1, src2, vector_len); break;
        case T_LONG:        evpminuq(dst, k0, src1, src2, true, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxub(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxuw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxud(dst, src1, src2, vector_len); break;
        case T_LONG:        evpmaxuq(dst, k0, src1, src2, true, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
        case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister
src1, XMMRegister src2) { 1864 int vector_len = Assembler::AVX_256bit; 1865 1866 switch (opcode) { 1867 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1868 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1869 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1870 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1871 default: assert(false, "%s", NodeClassNames[opcode]); 1872 } 1873 } 1874 1875 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1876 XMMRegister dst, XMMRegister src, 1877 XMMRegister vtmp1, XMMRegister vtmp2) { 1878 switch (opcode) { 1879 case Op_AddReductionVF: 1880 case Op_MulReductionVF: 1881 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1882 break; 1883 1884 case Op_AddReductionVD: 1885 case Op_MulReductionVD: 1886 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1887 break; 1888 1889 default: assert(false, "wrong opcode"); 1890 } 1891 } 1892 1893 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 1894 XMMRegister dst, XMMRegister src, 1895 XMMRegister vtmp1, XMMRegister vtmp2) { 1896 switch (opcode) { 1897 case Op_AddReductionVF: 1898 case Op_MulReductionVF: 1899 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1900 break; 1901 1902 case Op_AddReductionVD: 1903 case Op_MulReductionVD: 1904 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1905 break; 1906 1907 default: assert(false, "%s", NodeClassNames[opcode]); 1908 } 1909 } 1910 1911 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1912 Register dst, Register src1, XMMRegister src2, 1913 XMMRegister vtmp1, XMMRegister vtmp2) { 1914 switch (vlen) { 1915 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1916 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1917 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1918 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1919 1920 default: assert(false, "wrong vector length"); 
1921 } 1922 } 1923 1924 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1925 Register dst, Register src1, XMMRegister src2, 1926 XMMRegister vtmp1, XMMRegister vtmp2) { 1927 switch (vlen) { 1928 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1929 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1930 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1931 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1932 1933 default: assert(false, "wrong vector length"); 1934 } 1935 } 1936 1937 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1938 Register dst, Register src1, XMMRegister src2, 1939 XMMRegister vtmp1, XMMRegister vtmp2) { 1940 switch (vlen) { 1941 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1942 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1943 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1944 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1945 1946 default: assert(false, "wrong vector length"); 1947 } 1948 } 1949 1950 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1951 Register dst, Register src1, XMMRegister src2, 1952 XMMRegister vtmp1, XMMRegister vtmp2) { 1953 switch (vlen) { 1954 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1955 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1956 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1957 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1958 1959 default: assert(false, "wrong vector length"); 1960 } 1961 } 1962 1963 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1964 Register dst, Register src1, XMMRegister src2, 1965 XMMRegister vtmp1, XMMRegister vtmp2) { 1966 switch (vlen) { 1967 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1968 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1969 case 8: reduce8L(opcode, dst, src1, 
src2, vtmp1, vtmp2); break; 1970 1971 default: assert(false, "wrong vector length"); 1972 } 1973 } 1974 1975 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1976 switch (vlen) { 1977 case 2: 1978 assert(vtmp2 == xnoreg, ""); 1979 reduce2F(opcode, dst, src, vtmp1); 1980 break; 1981 case 4: 1982 assert(vtmp2 == xnoreg, ""); 1983 reduce4F(opcode, dst, src, vtmp1); 1984 break; 1985 case 8: 1986 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1987 break; 1988 case 16: 1989 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1990 break; 1991 default: assert(false, "wrong vector length"); 1992 } 1993 } 1994 1995 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1996 switch (vlen) { 1997 case 2: 1998 assert(vtmp2 == xnoreg, ""); 1999 reduce2D(opcode, dst, src, vtmp1); 2000 break; 2001 case 4: 2002 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2003 break; 2004 case 8: 2005 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2006 break; 2007 default: assert(false, "wrong vector length"); 2008 } 2009 } 2010 2011 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2012 switch (vlen) { 2013 case 2: 2014 assert(vtmp1 == xnoreg, ""); 2015 assert(vtmp2 == xnoreg, ""); 2016 unorderedReduce2F(opcode, dst, src); 2017 break; 2018 case 4: 2019 assert(vtmp2 == xnoreg, ""); 2020 unorderedReduce4F(opcode, dst, src, vtmp1); 2021 break; 2022 case 8: 2023 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2024 break; 2025 case 16: 2026 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2027 break; 2028 default: assert(false, "wrong vector length"); 2029 } 2030 } 2031 2032 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2033 switch (vlen) { 2034 case 2: 2035 assert(vtmp1 == xnoreg, ""); 2036 
assert(vtmp2 == xnoreg, ""); 2037 unorderedReduce2D(opcode, dst, src); 2038 break; 2039 case 4: 2040 assert(vtmp2 == xnoreg, ""); 2041 unorderedReduce4D(opcode, dst, src, vtmp1); 2042 break; 2043 case 8: 2044 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2); 2045 break; 2046 default: assert(false, "wrong vector length"); 2047 } 2048 } 2049 2050 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2051 if (opcode == Op_AddReductionVI) { 2052 if (vtmp1 != src2) { 2053 movdqu(vtmp1, src2); 2054 } 2055 phaddd(vtmp1, vtmp1); 2056 } else { 2057 pshufd(vtmp1, src2, 0x1); 2058 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2059 } 2060 movdl(vtmp2, src1); 2061 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2062 movdl(dst, vtmp1); 2063 } 2064 2065 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2066 if (opcode == Op_AddReductionVI) { 2067 if (vtmp1 != src2) { 2068 movdqu(vtmp1, src2); 2069 } 2070 phaddd(vtmp1, src2); 2071 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2072 } else { 2073 pshufd(vtmp2, src2, 0xE); 2074 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2075 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2076 } 2077 } 2078 2079 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2080 if (opcode == Op_AddReductionVI) { 2081 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2082 vextracti128_high(vtmp2, vtmp1); 2083 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2084 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2085 } else { 2086 vextracti128_high(vtmp1, src2); 2087 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2088 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2089 } 2090 } 2091 2092 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister 
vtmp1, XMMRegister vtmp2) { 2093 vextracti64x4_high(vtmp2, src2); 2094 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2095 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2096 } 2097 2098 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2099 pshufd(vtmp2, src2, 0x1); 2100 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2101 movdqu(vtmp1, vtmp2); 2102 psrldq(vtmp1, 2); 2103 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2104 movdqu(vtmp2, vtmp1); 2105 psrldq(vtmp2, 1); 2106 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2107 movdl(vtmp2, src1); 2108 if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) { 2109 pmovzxbd(vtmp1, vtmp1); 2110 } else { 2111 pmovsxbd(vtmp1, vtmp1); 2112 } 2113 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2114 pextrb(dst, vtmp1, 0x0); 2115 movsbl(dst, dst); 2116 } 2117 2118 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2119 pshufd(vtmp1, src2, 0xE); 2120 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2121 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2122 } 2123 2124 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2125 vextracti128_high(vtmp2, src2); 2126 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2127 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2128 } 2129 2130 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2131 vextracti64x4_high(vtmp1, src2); 2132 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2133 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2134 } 2135 2136 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2137 pmovsxbw(vtmp2, src2); 2138 
reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2139 } 2140 2141 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2142 if (UseAVX > 1) { 2143 int vector_len = Assembler::AVX_256bit; 2144 vpmovsxbw(vtmp1, src2, vector_len); 2145 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2146 } else { 2147 pmovsxbw(vtmp2, src2); 2148 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2149 pshufd(vtmp2, src2, 0xe); 2150 pmovsxbw(vtmp2, vtmp2); 2151 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2152 } 2153 } 2154 2155 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2156 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2157 int vector_len = Assembler::AVX_512bit; 2158 vpmovsxbw(vtmp1, src2, vector_len); 2159 reduce32S(opcode, dst, src1, vtmp1, vtmp2, vtmp1); 2160 } else { 2161 assert(UseAVX >= 2,"Should not reach here."); 2162 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2163 vextracti128_high(vtmp2, src2); 2164 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2165 } 2166 } 2167 2168 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2169 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2170 vextracti64x4_high(vtmp2, src2); 2171 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2172 } 2173 2174 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2175 if (opcode == Op_AddReductionVI) { 2176 if (vtmp1 != src2) { 2177 movdqu(vtmp1, src2); 2178 } 2179 phaddw(vtmp1, vtmp1); 2180 phaddw(vtmp1, vtmp1); 2181 } else { 2182 pshufd(vtmp2, src2, 0x1); 2183 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2184 movdqu(vtmp1, vtmp2); 2185 psrldq(vtmp1, 2); 2186 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2187 } 2188 
movdl(vtmp2, src1); 2189 if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) { 2190 pmovzxwd(vtmp1, vtmp1); 2191 } else { 2192 pmovsxwd(vtmp1, vtmp1); 2193 } 2194 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2195 pextrw(dst, vtmp1, 0x0); 2196 movswl(dst, dst); 2197 } 2198 2199 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2200 if (opcode == Op_AddReductionVI) { 2201 if (vtmp1 != src2) { 2202 movdqu(vtmp1, src2); 2203 } 2204 phaddw(vtmp1, src2); 2205 } else { 2206 assert_different_registers(src2, vtmp1); 2207 pshufd(vtmp1, src2, 0xE); 2208 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2209 } 2210 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2211 } 2212 2213 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2214 if (opcode == Op_AddReductionVI) { 2215 int vector_len = Assembler::AVX_256bit; 2216 vphaddw(vtmp2, src2, src2, vector_len); 2217 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2218 } else { 2219 assert_different_registers(src2, vtmp2); 2220 vextracti128_high(vtmp2, src2); 2221 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2222 } 2223 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2224 } 2225 2226 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2227 assert_different_registers(src2, vtmp1); 2228 int vector_len = Assembler::AVX_256bit; 2229 vextracti64x4_high(vtmp1, src2); 2230 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2231 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2232 } 2233 2234 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2235 pshufd(vtmp2, src2, 0xE); 2236 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2237 movdq(vtmp1, src1); 2238 reduce_operation_128(T_LONG, 
opcode, vtmp1, vtmp2); 2239 movdq(dst, vtmp1); 2240 } 2241 2242 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2243 vextracti128_high(vtmp1, src2); 2244 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2245 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2246 } 2247 2248 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2249 vextracti64x4_high(vtmp2, src2); 2250 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2251 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2252 } 2253 2254 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2255 mov64(temp, -1L); 2256 bzhiq(temp, temp, len); 2257 kmovql(dst, temp); 2258 } 2259 2260 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2261 reduce_operation_128(T_FLOAT, opcode, dst, src); 2262 pshufd(vtmp, src, 0x1); 2263 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2264 } 2265 2266 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2267 reduce2F(opcode, dst, src, vtmp); 2268 pshufd(vtmp, src, 0x2); 2269 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2270 pshufd(vtmp, src, 0x3); 2271 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2272 } 2273 2274 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2275 reduce4F(opcode, dst, src, vtmp2); 2276 vextractf128_high(vtmp2, src); 2277 reduce4F(opcode, dst, vtmp2, vtmp1); 2278 } 2279 2280 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2281 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2282 vextracti64x4_high(vtmp1, src); 2283 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2284 } 2285 2286 void C2_MacroAssembler::unorderedReduce2F(int opcode, 
XMMRegister dst, XMMRegister src) { 2287 pshufd(dst, src, 0x1); 2288 reduce_operation_128(T_FLOAT, opcode, dst, src); 2289 } 2290 2291 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2292 pshufd(vtmp, src, 0xE); 2293 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2294 unorderedReduce2F(opcode, dst, vtmp); 2295 } 2296 2297 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2298 vextractf128_high(vtmp1, src); 2299 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2300 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2301 } 2302 2303 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2304 vextractf64x4_high(vtmp2, src); 2305 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2306 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2307 } 2308 2309 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2310 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2311 pshufd(vtmp, src, 0xE); 2312 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2313 } 2314 2315 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2316 reduce2D(opcode, dst, src, vtmp2); 2317 vextractf128_high(vtmp2, src); 2318 reduce2D(opcode, dst, vtmp2, vtmp1); 2319 } 2320 2321 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2322 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2323 vextracti64x4_high(vtmp1, src); 2324 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2325 } 2326 2327 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2328 pshufd(dst, src, 0xE); 2329 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2330 } 2331 2332 void 
C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2333 vextractf128_high(vtmp, src); 2334 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2335 unorderedReduce2D(opcode, dst, vtmp); 2336 } 2337 2338 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2339 vextractf64x4_high(vtmp2, src); 2340 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2341 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2342 } 2343 2344 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2345 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2346 } 2347 2348 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2349 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2350 } 2351 2352 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2353 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2354 } 2355 2356 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2357 int vec_enc) { 2358 switch(elem_bt) { 2359 case T_INT: 2360 case T_FLOAT: 2361 vmaskmovps(dst, src, mask, vec_enc); 2362 break; 2363 case T_LONG: 2364 case T_DOUBLE: 2365 vmaskmovpd(dst, src, mask, vec_enc); 2366 break; 2367 default: 2368 fatal("Unsupported type %s", type2name(elem_bt)); 2369 break; 2370 } 2371 } 2372 2373 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2374 int vec_enc) { 2375 switch(elem_bt) { 2376 case T_INT: 2377 case T_FLOAT: 2378 vmaskmovps(dst, src, mask, vec_enc); 2379 break; 2380 case T_LONG: 2381 case T_DOUBLE: 2382 vmaskmovpd(dst, src, mask, vec_enc); 2383 break; 2384 default: 2385 
fatal("Unsupported type %s", type2name(elem_bt)); 2386 break; 2387 } 2388 } 2389 2390 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2391 XMMRegister dst, XMMRegister src, 2392 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2393 XMMRegister xmm_0, XMMRegister xmm_1) { 2394 const int permconst[] = {1, 14}; 2395 XMMRegister wsrc = src; 2396 XMMRegister wdst = xmm_0; 2397 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2398 2399 int vlen_enc = Assembler::AVX_128bit; 2400 if (vlen == 16) { 2401 vlen_enc = Assembler::AVX_256bit; 2402 } 2403 2404 for (int i = log2(vlen) - 1; i >=0; i--) { 2405 if (i == 0 && !is_dst_valid) { 2406 wdst = dst; 2407 } 2408 if (i == 3) { 2409 vextracti64x4_high(wtmp, wsrc); 2410 } else if (i == 2) { 2411 vextracti128_high(wtmp, wsrc); 2412 } else { // i = [0,1] 2413 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2414 } 2415 2416 if (VM_Version::supports_avx10_2()) { 2417 vminmax_fp_avx10_2(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc); 2418 } else { 2419 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2420 } 2421 wsrc = wdst; 2422 vlen_enc = Assembler::AVX_128bit; 2423 } 2424 if (is_dst_valid) { 2425 if (VM_Version::supports_avx10_2()) { 2426 vminmax_fp_avx10_2(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit); 2427 } else { 2428 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2429 } 2430 } 2431 } 2432 2433 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2434 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2435 XMMRegister xmm_0, XMMRegister xmm_1) { 2436 XMMRegister wsrc = src; 2437 XMMRegister wdst = xmm_0; 2438 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2439 int vlen_enc = Assembler::AVX_128bit; 2440 if (vlen == 8) { 2441 vlen_enc = Assembler::AVX_256bit; 2442 } 2443 for (int i = log2(vlen) - 1; i >=0; i--) { 2444 if (i == 0 && !is_dst_valid) { 2445 wdst = dst; 2446 } 2447 if (i == 1) { 2448 vextracti128_high(wtmp, wsrc); 2449 } else if (i == 2) { 2450 vextracti64x4_high(wtmp, wsrc); 2451 } else { 2452 assert(i == 0, "%d", i); 2453 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2454 } 2455 2456 if (VM_Version::supports_avx10_2()) { 2457 vminmax_fp_avx10_2(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc); 2458 } else { 2459 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2460 } 2461 2462 wsrc = wdst; 2463 vlen_enc = Assembler::AVX_128bit; 2464 } 2465 2466 if (is_dst_valid) { 2467 if (VM_Version::supports_avx10_2()) { 2468 vminmax_fp_avx10_2(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit); 2469 } else { 2470 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2471 } 2472 } 2473 } 2474 2475 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2476 switch (bt) { 2477 case T_BYTE: pextrb(dst, src, idx); break; 2478 case T_SHORT: pextrw(dst, src, idx); break; 2479 case T_INT: pextrd(dst, src, idx); break; 2480 case T_LONG: pextrq(dst, src, idx); break; 2481 2482 default: 2483 assert(false,"Should not reach here."); 2484 break; 2485 } 2486 } 2487 2488 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2489 int esize = type2aelembytes(typ); 2490 int elem_per_lane = 16/esize; 2491 int lane = elemindex / elem_per_lane; 2492 int eindex = elemindex % elem_per_lane; 2493 2494 if (lane >= 2) { 2495 assert(UseAVX > 2, "required"); 2496 vextractf32x4(dst, src, lane & 3); 2497 return dst; 2498 } else if (lane > 0) { 2499 assert(UseAVX > 0, "required"); 2500 vextractf128(dst, src, lane); 2501 return dst; 2502 } else { 2503 return src; 2504 } 2505 } 2506 2507 void 
C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2508 if (typ == T_BYTE) { 2509 movsbl(dst, dst); 2510 } else if (typ == T_SHORT) { 2511 movswl(dst, dst); 2512 } 2513 } 2514 2515 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2516 int esize = type2aelembytes(typ); 2517 int elem_per_lane = 16/esize; 2518 int eindex = elemindex % elem_per_lane; 2519 assert(is_integral_type(typ),"required"); 2520 2521 if (eindex == 0) { 2522 if (typ == T_LONG) { 2523 movq(dst, src); 2524 } else { 2525 movdl(dst, src); 2526 movsxl(typ, dst); 2527 } 2528 } else { 2529 extract(typ, dst, src, eindex); 2530 movsxl(typ, dst); 2531 } 2532 } 2533 2534 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2535 int esize = type2aelembytes(typ); 2536 int elem_per_lane = 16/esize; 2537 int eindex = elemindex % elem_per_lane; 2538 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2539 2540 if (eindex == 0) { 2541 movq(dst, src); 2542 } else { 2543 if (typ == T_FLOAT) { 2544 if (UseAVX == 0) { 2545 movdqu(dst, src); 2546 shufps(dst, dst, eindex); 2547 } else { 2548 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2549 } 2550 } else { 2551 if (UseAVX == 0) { 2552 movdqu(dst, src); 2553 psrldq(dst, eindex*esize); 2554 } else { 2555 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2556 } 2557 movq(dst, dst); 2558 } 2559 } 2560 // Zero upper bits 2561 if (typ == T_FLOAT) { 2562 if (UseAVX == 0) { 2563 assert(vtmp != xnoreg, "required."); 2564 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2565 pand(dst, vtmp); 2566 } else { 2567 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2568 } 2569 } 2570 } 2571 2572 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2573 switch(typ) { 2574 
case T_BYTE:
    case T_BOOLEAN:
      evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
      break;
    case T_SHORT:
    case T_CHAR:
      evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
      break;
    case T_INT:
    case T_FLOAT:
      evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
      break;
    case T_LONG:
    case T_DOUBLE:
      evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// Masked vector compare of src1 against the memory operand src2, dispatched on
// the element type:
//   T_BOOLEAN/T_BYTE -> evpcmpb,  T_CHAR/T_SHORT  -> evpcmpw,
//   T_INT/T_FLOAT    -> evpcmpd,  T_LONG/T_DOUBLE -> evpcmpq.
// Every variant is called with the signed flag set to true, including the
// T_BOOLEAN/T_CHAR and floating-point cases.
// rscratch may be noreg only when src2 is always reachable (asserted below).
// Any other type ends up in the asserting default case.
void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src2), "missing");

  switch(typ) {
    case T_BOOLEAN:
    case T_BYTE:
      evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
      break;
    case T_CHAR:
    case T_SHORT:
      evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
      break;
    case T_INT:
    case T_FLOAT:
      evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
      break;
    case T_LONG:
    case T_DOUBLE:
      evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// Masked blend of src1/src2 into dst, dispatched on the element type:
//   T_BYTE -> evpblendmb, T_SHORT -> evpblendmw,
//   T_INT/T_FLOAT -> evpblendmd, T_LONG/T_DOUBLE -> evpblendmq.
// Note that, unlike evpcmp above, T_BOOLEAN and T_CHAR are not listed here and
// therefore reach the asserting default case.
void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
  switch(typ) {
    case T_BYTE:
      evpblendmb(dst, kmask, src1, src2, merge, vector_len);
      break;
    case T_SHORT:
      evpblendmw(dst, kmask, src1, src2, merge, vector_len);
      break;
    case T_INT:
    case T_FLOAT:
      evpblendmd(dst, kmask, src1, src2, merge, vector_len);
      break;
    case T_LONG:
    case T_DOUBLE:
      evpblendmq(dst, kmask, src1, src2, merge, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// Emits a flag-setting vector test of src1 against src2 for vectors of at
// most 32 bytes.
//   vlen_in_bytes == 32: vtmp must be xnoreg; vtestps is used for elements of
//                        4 bytes or more, vptest otherwise (both AVX_256bit).
//   vlen_in_bytes == 16: vtmp must be xnoreg; src1 is tested directly.
//   vlen_in_bytes <  16: vtmp is required; the low part of src1 is replicated
//                        into vtmp with pshufd and vtmp is tested instead.
// For the 16-byte-or-smaller cases vtestps is only chosen when the element
// size is >= 4 and VM_Version::supports_avx() holds; otherwise ptest is used.
void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
  assert(vlen_in_bytes <= 32, "");
  int esize = type2aelembytes(bt);
  if (vlen_in_bytes == 32) {
    assert(vtmp == xnoreg, "required.");
    if (esize >= 4) {
      vtestps(src1, src2, AVX_256bit);
    } else {
      vptest(src1, src2, AVX_256bit);
    }
    return;
  }
  if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register,
    // Don't need to do so for src2
    assert(vtmp != xnoreg, "required");
    int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
    pshufd(vtmp, src1, shuffle_imm);
  } else {
    assert(vtmp == xnoreg, "required");
    // Full 16-byte vector: no copy needed, test src1 itself.
    vtmp = src1;
  }
  if (esize >= 4 && VM_Version::supports_avx()) {
    vtestps(vtmp, src2, AVX_128bit);
  } else {
    ptest(vtmp, src2);
  }
}

// Element-wise vector add dst = src1 + src2, dispatched on elem_bt:
// vpaddb/vpaddw/vpaddd/vpaddq for T_BYTE/T_SHORT/T_INT/T_LONG, vaddps for
// T_FLOAT and vaddpd for T_DOUBLE. Any other type is fatal.
// Under ASSERT only: for byte/short elements without AVX512BW the vector must
// not be 512 bits wide and all three registers must have an encoding below 16.
void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
#ifdef ASSERT
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_bw_supported = VM_Version::supports_avx512bw();
  if (is_bw && !is_bw_supported) {
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
           "XMM register should be 0-15");
  }
#endif // ASSERT
  switch (elem_bt) {
    case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
    case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
    case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
    case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
    case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
    case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
    default: fatal("Unsupported type %s", type2name(elem_bt)); return;
  }
}

// Broadcasts the general-purpose register src into every lane of dst.
// Requires UseAVX >= 2.
// When UseAVX > 2 and the needed extensions are present (AVX512BW for
// byte/short elements, AVX512VL for vectors narrower than 512 bits) the
// evpbroadcast* forms are used, which take the general-purpose register
// directly. Otherwise src is first moved into dst (movdl for up to 32-bit
// elements, movdq for 64-bit ones) and then broadcast from dst itself; on that
// path the vector must not be 512 bits wide and dst must have an encoding
// below 16. Unsupported element types are fatal on both paths.
void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
  assert(UseAVX >= 2, "required");
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_vl = vlen_enc != Assembler::AVX_512bit;
  if ((UseAVX > 2) &&
      (!is_bw || VM_Version::supports_avx512bw()) &&
      (!is_vl || VM_Version::supports_avx512vl())) {
    switch (elem_bt) {
      case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
      case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
      case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
      case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  } else {
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16),"XMM register should be 0-15");
    switch (elem_bt) {
      case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
      case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
      case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
      case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
      case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
      case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  }
}

// Converts a vector of bytes in src into a vector of to_elem_bt in dst.
//   T_SHORT/T_INT/T_LONG: a single vpmovsxbw/vpmovsxbd/vpmovsxbq.
//   T_FLOAT:  vpmovsxbd to int, then vcvtdq2ps in place.
//   T_DOUBLE: vpmovsxbd to int at a reduced vector length (AVX_256bit when the
//             target is AVX_512bit, AVX_128bit otherwise), then vcvtdq2pd at
//             the full target length.
// Any other target type is fatal.
void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  switch (to_elem_bt) {
    case T_SHORT:
      vpmovsxbw(dst, src, vlen_enc);
      break;
    case T_INT:
      vpmovsxbd(dst, src, vlen_enc);
      break;
    case T_FLOAT:
      vpmovsxbd(dst, src, vlen_enc);
      vcvtdq2ps(dst, dst, vlen_enc);
      break;
    case T_LONG:
      vpmovsxbq(dst, src, vlen_enc);
      break;
    case T_DOUBLE: {
      int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
      vpmovsxbd(dst, src, mid_vlen_enc);
      vcvtdq2pd(dst, dst, vlen_enc);
      break;
    }
    default:
      fatal("Unsupported type %s", type2name(to_elem_bt));
      break;
  }
}

//-------------------------------------------------------------------------------------------

// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
//
//   str1, cnt1 - address and element count of the scanned string (cnt1 == rdx)
//   str2       - address of the substring
//   cnt2       - must be rax; loaded with int_cnt2 by this code
//   int_cnt2   - constant substring length in elements, asserted >= stride
//   result     - on exit: element index of the first match, or -1
//   vec        - holds the current 16-byte piece of the substring
//   tmp        - must be rcx; receives the index produced by pcmpestri
//   ae         - StrIntrinsicNode encoding; LU is rejected by the assert
//
// cnt1, cnt2, tmp and vec are overwritten; str1 and str2 are only read.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2, Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  // scale1 addresses elements of the string, scale2 elements of the substring
  // (the substring is byte-sized for UL).
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1); // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND); // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      // Small enough constant: fold the tail offset into the displacement.
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(EXIT);

} // string_indexofC8

// Small strings are loaded through stack if they cross page boundary.
//
//   str1, cnt1 - address and element count of the scanned string (cnt1 == rdx)
//   str2, cnt2 - address and element count of the substring (cnt2 == rax);
//                cnt2 is only an input when int_cnt2 == -1
//   int_cnt2   - constant substring length (0 < int_cnt2 < stride), or -1 when
//                the length is only known at run time
//   result     - on exit: element index of the first match, or -1
//   vec        - holds the current 16-byte piece of the substring
//   tmp        - must be rcx; first used to save rsp, then receives the index
//                produced by pcmpestri
//   ae         - StrIntrinsicNode encoding; LU is rejected by the assert
//
// The original rsp is saved in tmp on entry, pushed once the operands are set
// up, and restored with pop(rsp) at CLEANUP, which releases any stack copies
// made here. str1 and str2 may be redirected to those stack copies, and cnt1,
// cnt2 and vec are overwritten.
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2, Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  // scale1 addresses elements of the string, scale2 elements of the substring
  // (the substring is byte-sized for UL).
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) { // small (< 8 chars) constant substring
      if (int_cnt2 == (1>>scale2)) { // One byte
        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
        load_unsigned_byte(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
        // Not enough header space in 32-bit VM: 12+3 = 15.
        movl(result, Address(str2, -1));
        shrl(result, 8);
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
        movq(vec, Address(str2, 0)); // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
        // Load a full vector that ends at the end of the substring (so the
        // load starts before str2), then shift the leading bytes out.
        if (ae == StrIntrinsicNode::UL) {
          int tail_off = int_cnt2-8;
          pmovzxbw(vec, Address(str2, tail_off));
          psrldq(vec, -2*tail_off);
        }
        else {
          int tail_off = int_cnt2*(1<<scale2);
          movdqu(vec, Address(str2, tail_off-16));
          psrldq(vec, 16-tail_off);
        }
      }
    } else { // not constant substring
      cmpl(cnt2, stride);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if str+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, ((int)os::vm_page_size()-1));
      cmpl(result, ((int)os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      // cnt2 is pushed below the 16-byte buffer, so the copy targets are
      // addressed one word above rsp.
      int stk_offset = wordSize-(1<<scale2);
      push(cnt2);

      // Copy the substring element by element, from the last one down to the
      // first, using cnt2 as the running index.
      bind(COPY_SUBSTR);
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
        movb(Address(rsp, cnt2, scale2, stk_offset), result);
      } else if (ae == StrIntrinsicNode::UU) {
        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
        movw(Address(rsp, cnt2, scale2, stk_offset), result);
      }
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp); // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, stride);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, ((int)os::vm_page_size()-1));
    cmpl(result, ((int)os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -(1<<scale1);
    if (int_cnt2 < 0) { // not constant
      // cnt2 is borrowed as the copy index below, so preserve it on the stack
      // and skip over that slot when addressing the buffer.
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    // Copy the string element by element, from the last one down to the first.
    bind(COPY_STR);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
      movb(Address(rsp, cnt2, scale1, stk_offset), result);
    } else {
      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
      movw(Address(rsp, cnt2, scale1, stk_offset), result);
    }
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp); // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, 0));
      } else {
        movdqu(vec, Address(str2, 0));
      }
      push(cnt2); // substr count
      push(str2); // substr addr
      push(str1); // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp); // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2 < 0) { // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
      shrl(str1, 1);
    }
    addl(cnt1, str1);
    decrementl(cnt1); // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring

    addptr(result, (1<<scale1));
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, stride); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, scale1, -16));
  movl(cnt1, stride);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));
  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, stride-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, stride);
    jcc(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, stride);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, stride);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    if (ae == StrIntrinsicNode::UL) {
      addptr(str2, 8);
    } else {
      addptr(str2, 16);
    }
    subl(cnt1, stride);
    cmpl(cnt2, stride); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.

    if (ae == StrIntrinsicNode::UL) {
      lea(str2, Address(str2, cnt2, scale2, -8));
      lea(str1, Address(str1, cnt2, scale1, -16));
    } else {
      lea(str2, Address(str2, cnt2, scale2, -16));
      lea(str1, Address(str1, cnt2, scale1, -16));
    }
    subl(cnt1, cnt2);
    movl(cnt2, stride);
    addl(cnt1, stride);
    bind(CONT_SCAN_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    jmp(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    // str1 was used as a cursor above; reload the string address saved on
    // the stack so the offset below is relative to the string start.
    movptr(str1, Address(rsp, wordSize));
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof

// Finds the first occurrence of the 16-bit value in ch within the cnt1
// two-byte elements starting at str1.
//   result - on exit: element index of the first match, or -1
//   vec1   - ch replicated into every 16-bit lane
//   vec2   - all zeroes, used as the ptest/vptest mask
//   vec3   - scratch for the loaded data / compare result
// With UseAVX >= 2, inputs of at least 16 elements are first scanned 32 bytes
// at a time; at least 8 remaining elements are scanned 16 bytes at a time; the
// rest is scanned one element at a time. cnt1, ch and tmp are overwritten.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8;

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0); //vector count (in chars)
    andl(cnt1,0x0000000F); //tail count (in chars)

    bind(SCAN_TO_16_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    // NOTE(review): the literal 1 is the vector-length argument; presumably it
    // equals Assembler::AVX_256bit, which stringL_indexof_char spells out for
    // the analogous vpcmpeqb - confirm.
    vpcmpeqw(vec3, vec3, vec1, 1);
    // vec2 is zero, so CF is clear exactly when vec3 has a bit set, i.e. when
    // at least one lane compared equal.
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);
    bind(SCAN_TO_8_CHAR_INIT);
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    // Without AVX2 nothing above has set up vec1/vec2 yet.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8); //vector count (in chars)
  andl(cnt1,0x00000007); //tail count (in chars)

  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  // The lowest set mask bit gives the byte offset of the first matching lane;
  // ch is reused as scratch for it.
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  // Convert the address of the match into an element index.
  subptr(result, str1);
  shrl(result, 1);

  bind(DONE_LABEL);
} // string_indexof_char

// Byte-element counterpart of string_indexof_char: finds the first occurrence
// of the value in ch within the cnt1 bytes starting at str1.
//   result - on exit: index of the first match, or -1
//   vec1   - ch replicated into every byte lane
//   vec2   - all zeroes, used as the ptest/vptest mask (and as the pshufb
//            control that replicates byte 0)
//   vec3   - scratch for the loaded data / compare result
// With UseAVX >= 2, inputs of at least 32 bytes are first scanned 32 bytes at
// a time; at least 16 remaining bytes are scanned 16 at a time; the rest is
// scanned one byte at a time. cnt1, ch and tmp are overwritten.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16;

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0); //vector count (in chars)
    andl(cnt1,0x0000001F); //tail count (in chars)

    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
    // vec2 is zero, so CF is clear exactly when vec3 has a bit set, i.e. when
    // at least one lane compared equal.
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
  if (UseAVX < 2) {
    // Without AVX2 nothing above has set up vec1/vec2 yet.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0); //vector count (in bytes)
  andl(cnt1,0x0000000F); //tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...

  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  // The lowest set mask bit gives the byte offset of the first matching lane;
  // ch is reused as scratch for it.
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  // Elements are bytes, so the address difference already is the index.
  subptr(result, str1);

  bind(DONE_LABEL);
} // stringL_indexof_char

// Returns the size in bytes of one array element of the given type, for the
// element types supported by arrays_hashcode (T_BOOLEAN, T_BYTE, T_SHORT,
// T_CHAR, T_INT). Any other type hits ShouldNotReachHere().
int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
  switch (eltype) {
  case T_BOOLEAN: return sizeof(jboolean);
  case T_BYTE: return sizeof(jbyte);
  case T_SHORT: return sizeof(jshort);
  case T_CHAR: return sizeof(jchar);
  case T_INT: return sizeof(jint);
  default:
    ShouldNotReachHere();
    return -1;
  }
}

// Loads a single array element from src into the 32-bit register dst:
// zero-extending for T_BOOLEAN/T_CHAR (movzbl/movzwl), sign-extending for
// T_BYTE/T_SHORT (movsbl/movswl), and a plain movl for T_INT.
void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
  switch (eltype) {
  // T_BOOLEAN used as surrogate for unsigned byte
  case T_BOOLEAN: movzbl(dst, src); break;
  case T_BYTE: movsbl(dst, src); break;
  case T_SHORT: movswl(dst, src); break;
  case T_CHAR: movzwl(dst, src); break;
  case T_INT: movl(dst, src); break;
  default:
    ShouldNotReachHere();
  }
}

// Loads eight consecutive elements (8 * element size bytes) from src into dst
// via load_vector.
void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
  load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
}

// Same as the Address overload above, for an AddressLiteral source.
void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
  load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
}

// Widens the elements in dst, in place, to T_INT at AVX_256bit: an unsigned
// cast for T_BOOLEAN/T_CHAR and a signed cast for T_BYTE/T_SHORT. T_INT needs
// no conversion.
void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
  const int vlen = Assembler::AVX_256bit;
  switch (eltype) {
  case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break;
  case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break;
  case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
  case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
  case T_INT:
    // do nothing
    break;
  default:
    ShouldNotReachHere();
  }
}

// Folds the cnt1 elements starting at ary1 into the running hash in result,
// applying result = 31 * result + element for each element in order.
//   ary1    - address of the first element; advanced past the vectorized part
//   cnt1    - element count; reduced by the number of vectorized elements
//   result  - in: initial hash value, out: updated hash value
//   index, tmp2, tmp3 - scratch; must differ from each other and from
//             ary1/cnt1/result
//   vnext, vcoef0-3, vresult0-3, vtmp0-3 - vector scratch, all distinct
//   eltype  - element type, one of those accepted by arrays_hashcode_elsize
// Requires UseAVX >= 2. Arrays of 32 or more elements first go through a loop
// handling 32 elements per iteration in four 256-bit accumulators; whatever
// remains is handled two elements per iteration, plus one final element when
// the remaining count is odd.
void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
                                        Register index, Register tmp2, Register tmp3, XMMRegister vnext,
                                        XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
                                        XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
                                        XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
                                        BasicType eltype) {
  ShortBranchVerifier sbv(this);
  assert(UseAVX >= 2, "AVX2 intrinsics are required");
  assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
  assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);

  Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
        SHORT_UNROLLED_LOOP_EXIT,
        UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
        UNROLLED_VECTOR_LOOP_BEGIN,
        END;
  switch (eltype) {
  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
  case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break;
  case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break;
  case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break;
  case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break;
  default: BLOCK_COMMENT("arrays_hashcode {"); break;
  }

  // For "renaming" for readability of the code
  const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
                    vresult[] = { vresult0, vresult1, vresult2, vresult3 },
                    vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };

  const int elsize = arrays_hashcode_elsize(eltype);

  /*
  if (cnt1 >= 2) {
    if (cnt1 >= 32) {
      UNROLLED VECTOR LOOP
    }
    UNROLLED SCALAR LOOP
  }
  SINGLE SCALAR
  */

  cmpl(cnt1, 32);
  jcc(Assembler::less, SHORT_UNROLLED_BEGIN);

  // cnt1 >= 32 && generate_vectorized_loop
  xorl(index, index);

  // vresult = IntVector.zero(I256);
  for (int idx = 0; idx < 4; idx++) {
    vpxor(vresult[idx], vresult[idx]);
  }
  // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
  // bound and next are aliases for tmp2 and tmp3 while the vector loop runs.
  Register bound = tmp2;
  Register next = tmp3;
  lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
  movl(next, Address(tmp2, 0));
  movdl(vnext, next);
  vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);

  // index = 0;
  // bound = cnt1 & ~(32 - 1);
  movl(bound, cnt1);
  andl(bound, ~(32 - 1));
  // for (; index < bound; index += 32) {
  bind(UNROLLED_VECTOR_LOOP_BEGIN);
  // result *= next;
  imull(result, next);
  // loop fission to upfront the cost of fetching from memory, OOO execution
  // can then hopefully do a better job of prefetching
  for (int idx = 0; idx < 4; idx++) {
    arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
  }
  // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
  for (int idx = 0; idx < 4; idx++) {
    vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
    arrays_hashcode_elvcast(vtmp[idx], eltype);
    vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
  }
  // index += 32;
  addl(index, 32);
  // index < bound;
  cmpl(index, bound);
  jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
  // }

  // Step past the vectorized elements so the scalar code below only sees the
  // remainder.
  lea(ary1, Address(ary1, bound, Address::times(elsize)));
  subl(cnt1, bound);
  // release bound

  // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
  for (int idx = 0; idx < 4; idx++) {
    lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
    arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
    vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
  }
  // result += vresult.reduceLanes(ADD);
  for (int idx = 0; idx < 4; idx++) {
    reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
  }

  // } else if (cnt1 < 32) {

  bind(SHORT_UNROLLED_BEGIN);
  // int i = 1;
  movl(index, 1);
  cmpl(index, cnt1);
  jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);

  // for (; i < cnt1 ; i += 2) {
  bind(SHORT_UNROLLED_LOOP_BEGIN);
  // result = result * 961 + ary1[i - 1] * 31 + ary1[i], with 961 == 31 * 31
  movl(tmp3, 961);
  imull(result, tmp3);
  arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  // tmp3 = 31 * tmp2, computed as (tmp2 << 5) - tmp2
  movl(tmp3, tmp2);
  shll(tmp3, 5);
  subl(tmp3, tmp2);
  addl(result, tmp3);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
  addl(result, tmp3);
  addl(index, 2);
  cmpl(index, cnt1);
  jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);

  // }
  // if (i >= cnt1) {
  bind(SHORT_UNROLLED_LOOP_EXIT);
  // The flags still come from the cmpl(index, cnt1) executed on both paths
  // that reach this label: index > cnt1 means no element is left over.
  jccb(Assembler::greater, END);
  // result = 31 * result + ary1[index - 1], with 31 * result computed as
  // (result << 5) - result
  movl(tmp2, result);
  shll(result, 5);
  subl(result, tmp2);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  addl(result, tmp3);
  // }
  bind(END);

  BLOCK_COMMENT("} // arrays_hashcode");

} // arrays_hashcode

// helper function for string_compare
// Loads one element from each string at position 'index'.
//   LL:        a byte from str1 and a byte from str2, both addressed with 'scale'.
//   UU:        an unsigned short from str1 and from str2, both addressed with 'scale'.
//   otherwise: a byte from str1 (addressed with 'scale1') and an unsigned short
//              from str2 (addressed with 'scale2').
// The loaded values are zero-extended into elem1 and elem2.
void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
                                           Address::ScaleFactor scale, Address::ScaleFactor scale1,
                                           Address::ScaleFactor scale2, Register index, int ae) {
  if (ae == StrIntrinsicNode::LL) {
    load_unsigned_byte(elem1, Address(str1, index, scale, 0));
    load_unsigned_byte(elem2, Address(str2, index, scale, 0));
  } else if (ae == StrIntrinsicNode::UU) {
    load_unsigned_short(elem1, Address(str1, index, scale, 0));
    load_unsigned_short(elem2, Address(str2, index, scale, 0));
  } else {
    load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
    load_unsigned_short(elem2, Address(str2, index, scale2, 0));
  }
}

// Compare strings, used for char[] and byte[].
//
// 'ae' selects the argument encoding (StrIntrinsicNode::LL, UU, LU or UL).
// On exit 'result' holds either the difference of the first pair of
// mismatching elements, or - when the strings agree up to the shorter
// length - the length difference that was pushed on the stack at entry.
// For UL the final value is negated.
//
// str1, str2, cnt1, cnt2 and vec1 are all overwritten. The vectorized paths
// assert result == rax, cnt2 == rdx and cnt1 == rcx because pcmpestri uses
// those registers implicitly. 'mask' is only used by the AVX-512 loop.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  int stride2x2 = 0x40;
  Address::ScaleFactor scale = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    stride2x2 = 0x20;
  }

  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    shrl(cnt2, 1);
  }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);
  cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    // Mixed encoding: byte from str1, short from str2
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  // The first elements differ: their difference is the answer.
  subl(result, cnt1);
  jcc(Assembler::notZero, POP_LABEL);

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  // Only one element to compare and it matched: return the length difference.
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  // Check if the strings start at the same location and setup scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16;
    } else {
      scale = Address::times_2;
      stride = 8;
    }
  } else {
    scale1 = Address::times_1;
    scale2 = Address::times_2;
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only AVX3

    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      // Clear the low bit of the pcmpestri mode for the byte-element case
      pcmpmask &= ~0x01;
    }

    // Setup to compare 16-chars (32-bytes) vectors,
    // start from first character again because it has aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8;  //stride << scale1;
      adr_stride2 = 16; //stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path : compare first 2 8-char vectors.
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    // Mismatch was in the second vector: bias the index by one stride
    addl(cnt1, stride);

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);

    //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      }
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
      addptr(result, stride2x2);  // update since we already compared at this addr
      subl(cnt2, stride2x2);      // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    // Any non-zero bit after the xor means the two vectors differ
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    // Run the AVX2 loop body once more with result = -stride2, cnt2 = stride2
    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 to 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    // Branches on the flags set by the andl above (no instruction is emitted in between)
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1- substring
    //     rax - negative string length (elements count)
    //     mem - scanned string
    //     rdx - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length.  Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

  if (VM_Version::supports_avx512vlbw()) {

    // Target of the miscompare branch in the AVX-512 loop above
    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    // Invert the equality mask and find the lowest set bit,
    // i.e. the position of the first byte that compared unequal.
    kmovql(cnt1, mask);
    notq(cnt1);
    bsfq(cnt2, cnt1);
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if(ae == StrIntrinsicNode::UL) {
    negl(result);
  }

}

// Search for Non-ASCII character (Negative byte value) in a byte array,
// return the index of the first such character, otherwise the length
// of the array segment searched.
//   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
//   @IntrinsicCandidate
//   public static int countPositives(byte[] ba, int off, int len) {
//     for (int i = off; i < off + len; i++) {
//       if (ba[i] < 0) {
//         return i - off;
//       }
//     }
//     return len;
//   }
//
// 'result' starts as a copy of 'len' and is adjusted downwards when a
// negative byte is found. ary1, len, tmp1, vec1 and vec2 are overwritten;
// mask1 and mask2 are only used on the AVX-512 path.
void C2_MacroAssembler::count_positives(Register ary1, Register len,
  Register result, Register tmp1,
  XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy
  // len == 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len,  0xffffffc0); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);
    jcc(Assembler::zero, DONE);


    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
    {
      // len is free at this point, so it is reused as scratch for the mask
      Register tmp3_aliased = len;
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);
      notq(tmp3_aliased);
      kmovql(mask2, tmp3_aliased);
    }

    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);

    // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds low 6-bit from original len;
                     // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000001f);
      // Load the 32 bytes that end at the last byte of the tail
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are zeros, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
      // Set up to look at the last 32 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 31);
      movl(len, 31);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 16-byte vectors
      testl(len, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, TAIL_START);

      andl(len, 0xfffffff0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      // Broadcast the 0x80 sign-bit pattern to every byte of vec2
      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 16);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000000f); // len is zero, any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000000f);   // tail count (in bytes)
      movdqu(vec1, Address(ary1, len, Address::times_1, -16));
      ptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up and look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 15);
      movl(len, 15);
      // Fallthru to tail compare
    }
  }

  bind(TAIL_START);
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TAIL_ADJUST);
  addptr(len, 4);
  jccb(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2-3 bytes), if any
  bind(COMPARE_CHAR);

  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, CHAR_ADJUST);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail  byte
  jccb(Assembler::zero, DONE);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  testl(tmp1, 0x00000080);
  jccb(Assembler::zero, DONE);
  // The final byte is negative: it is not counted
  subptr(result, 1);
  jmpb(DONE);

  bind(TAIL_ADJUST);
  // there are negative bits in the last 4 byte block.
  // Adjust result and check the next three bytes
  addptr(result, len);
  orl(result, 3);
  lea(ary1, Address(ary1, len, Address::times_1));
  jmpb(COMPARE_CHAR);

  bind(CHAR_ADJUST);
  // We are looking at a char + optional byte tail, and found that one
  // of the bytes in the char is negative. Adjust the result, check the
  // first byte and readjust if needed.
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
//
// On exit 'result' is 1 when the compared ranges are equal and 0 otherwise.
// When is_array_equ is true, ary1/ary2 are treated as array oops: identity,
// null and length checks are emitted and 'limit' is loaded from the array
// header. Otherwise 'limit' is taken as given and ary1/ary2 are used as
// addresses directly. With expand_ary2, each byte of ary2 is zero-extended
// to 16 bits before being compared with ary1.
// ary1, ary2, limit, chr, vec1 and vec2 are overwritten.
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char,
                                      KRegister mask, bool expand_ary2) {
  // for expand_ary2, limit is the (smaller) size of the second array.
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
         "Expansion only implemented for AVX2");

  int length_offset  = arrayOopDesc::length_offset_in_bytes();
  int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  // With expand_ary2, ary1 advances two bytes for every byte of ary2
  Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
  int scaleIncr = expand_ary2 ? 8 : 16;

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1);      // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;

    // Compare 32-byte vectors
    if (expand_ary2) {
      andl(result, 0x0000000f);  //   tail count (in bytes)
      andl(limit, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);
    } else {
      andl(result, 0x0000001f);  //   tail count (in bytes)
      andl(limit, 0xffffffe0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL_16);
    }

    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
      addptr(limit, 64);  // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instruction and just continue via non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64);   // it is safe, bc we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    vpxor(vec1, vec2);

    // Any non-zero bit after the xor means the two vectors differ
    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr * 2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Compare the final vector, positioned so that it ends at the end of the tail
    vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    }
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    jmp(TRUE_LABEL);

    bind(COMPARE_TAIL_16); // limit is zero
    movl(limit, result);

    // Compare 16-byte chunks
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS_16);
    movdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
    } else {
      movdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Compare the final 16 bytes, positioned so that they end at the end of the tail
    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  if (expand_ary2) {
    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);
  } else {
    andl(limit, 0xfffffffc); // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_CHAR);
  }

  lea(ary1, Address(ary1, limit, scaleFactor));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  if (expand_ary2) {
    // There are no "vector" operations for bytes to shorts
    movzbl(chr, Address(ary2, limit, Address::times_1));
    cmpw(Address(ary1, limit, Address::times_2), chr);
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 1);
    jcc(Assembler::notZero, COMPARE_VECTORS);
    jmp(TRUE_LABEL);
  } else {
    movl(chr, Address(ary1, limit, Address::times_1));
    cmpl(chr, Address(ary2, limit, Address::times_1));
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 4);
    jcc(Assembler::notZero, COMPARE_VECTORS);
  }

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    // No single-byte tail check is emitted for char[] array comparison
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1);   // tail  byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

// Out-of-line slow path for convertF2I. The stub data is (dst, src, target).
// Reserves an 8-byte stack slot, stores src into it as a double-width move,
// calls 'target', pops the slot into dst and jumps back to the continuation.
// NOTE(review): presumably the fixup routine at 'target' replaces the stack
// slot with the corrected integer result - confirm against the stub routines.
static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
#define __ masm.
  Register dst = stub.data<0>();
  XMMRegister src = stub.data<1>();
  address target = stub.data<2>();
  __ bind(stub.entry());
  __ subptr(rsp, 8);
  __ movdbl(Address(rsp), src);
  __ call(RuntimeAddress(target));
  // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
  __ pop(dst);
  __ jmp(stub.continuation());
#undef __
}

// Converts the float/double in 'src' to an int/long in 'dst' using a
// truncating convert. The result is then compared with 0x80000000 (int) or
// with the value at StubRoutines::x86::double_sign_flip() (long); when they
// are equal, control goes to an out-of-line stub that calls the matching
// f2i/d2i/f2l/d2l fixup routine.
void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
  assert(dst_bt == T_INT || dst_bt == T_LONG, "");
  assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");

  address slowpath_target;
  if (dst_bt == T_INT) {
    if (src_bt == T_FLOAT) {
      cvttss2sil(dst, src);
      cmpl(dst, 0x80000000);
      slowpath_target = StubRoutines::x86::f2i_fixup();
    } else {
      cvttsd2sil(dst, src);
      cmpl(dst, 0x80000000);
      slowpath_target = StubRoutines::x86::d2i_fixup();
    }
  } else {
    if (src_bt == T_FLOAT) {
      cvttss2siq(dst, src);
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::f2l_fixup();
    } else {
      cvttsd2siq(dst, src);
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::d2l_fixup();
    }
  }

  // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
  int max_size = 23 + (UseAPX ? 1 : 0);
  auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
  // Branches on the flags set by the compare emitted above
  jcc(Assembler::equal, stub->entry());
  bind(stub->continuation());
}

// Emits the masked vector shift or rotate by immediate 'imm8' that
// corresponds to 'ideal_opc'. Rotates are additionally dispatched on 'eType'.
// Any other opcode is a fatal error.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, int imm8, bool merge, int vlen_enc) {
  switch(ideal_opc) {
    case Op_LShiftVS:
      Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVI:
      Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVL:
      Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVS:
      Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVI:
      Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVL:
      Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVS:
      Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVI:
      Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVL:
      Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation  %s", NodeClassNames[ideal_opc]);
      break;
  }
}

// Masked saturating add/sub with a register second operand: forwards to the
// unsigned or signed emitter according to 'is_unsigned'.
void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
                                               XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
  if (is_unsigned) {
    evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
  } else {
    evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
  }
}

// Masked signed saturating add (Op_SaturatingAddV) or subtract
// (Op_SaturatingSubV) for T_BYTE and T_SHORT elements. Any other element
// type is a fatal error.
void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
                                                      XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
  switch (elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// Masked unsigned saturating add (Op_SaturatingAddV) or subtract
// (Op_SaturatingSubV) for T_BYTE and T_SHORT elements. Any other element
// type is a fatal error.
void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
                                                        XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
  switch (elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
                                               Address src2, bool is_unsigned, bool merge, int vlen_enc) {
  if (is_unsigned) {
    evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2,
merge, vlen_enc); 4679 } else { 4680 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4681 } 4682 } 4683 4684 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4685 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4686 switch (elem_bt) { 4687 case T_BYTE: 4688 if (ideal_opc == Op_SaturatingAddV) { 4689 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4690 } else { 4691 assert(ideal_opc == Op_SaturatingSubV, ""); 4692 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4693 } 4694 break; 4695 case T_SHORT: 4696 if (ideal_opc == Op_SaturatingAddV) { 4697 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4698 } else { 4699 assert(ideal_opc == Op_SaturatingSubV, ""); 4700 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4701 } 4702 break; 4703 default: 4704 fatal("Unsupported type %s", type2name(elem_bt)); 4705 break; 4706 } 4707 } 4708 4709 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4710 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4711 switch (elem_bt) { 4712 case T_BYTE: 4713 if (ideal_opc == Op_SaturatingAddV) { 4714 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4715 } else { 4716 assert(ideal_opc == Op_SaturatingSubV, ""); 4717 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4718 } 4719 break; 4720 case T_SHORT: 4721 if (ideal_opc == Op_SaturatingAddV) { 4722 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4723 } else { 4724 assert(ideal_opc == Op_SaturatingSubV, ""); 4725 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4726 } 4727 break; 4728 default: 4729 fatal("Unsupported type %s", type2name(elem_bt)); 4730 break; 4731 } 4732 } 4733 4734 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4735 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4736 bool is_varshift) { 4737 
switch (ideal_opc) { 4738 case Op_AddVB: 4739 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4740 case Op_AddVS: 4741 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4742 case Op_AddVI: 4743 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4744 case Op_AddVL: 4745 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4746 case Op_AddVF: 4747 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4748 case Op_AddVD: 4749 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4750 case Op_SubVB: 4751 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4752 case Op_SubVS: 4753 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4754 case Op_SubVI: 4755 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4756 case Op_SubVL: 4757 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4758 case Op_SubVF: 4759 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4760 case Op_SubVD: 4761 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4762 case Op_MulVS: 4763 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4764 case Op_MulVI: 4765 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4766 case Op_MulVL: 4767 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4768 case Op_MulVF: 4769 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4770 case Op_MulVD: 4771 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4772 case Op_DivVF: 4773 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4774 case Op_DivVD: 4775 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4776 case Op_SqrtVF: 4777 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4778 case Op_SqrtVD: 4779 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4780 case Op_AbsVB: 4781 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4782 case Op_AbsVS: 4783 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4784 case Op_AbsVI: 4785 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4786 case Op_AbsVL: 4787 evpabsq(dst, mask, 
src2, merge, vlen_enc); break; 4788 case Op_FmaVF: 4789 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4790 case Op_FmaVD: 4791 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4792 case Op_VectorRearrange: 4793 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4794 case Op_LShiftVS: 4795 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4796 case Op_LShiftVI: 4797 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4798 case Op_LShiftVL: 4799 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4800 case Op_RShiftVS: 4801 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4802 case Op_RShiftVI: 4803 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4804 case Op_RShiftVL: 4805 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4806 case Op_URShiftVS: 4807 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4808 case Op_URShiftVI: 4809 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4810 case Op_URShiftVL: 4811 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4812 case Op_RotateLeftV: 4813 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4814 case Op_RotateRightV: 4815 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4816 case Op_MaxV: 4817 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4818 case Op_MinV: 4819 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4820 case Op_UMinV: 4821 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4822 case Op_UMaxV: 4823 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4824 case Op_XorV: 4825 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4826 case Op_OrV: 4827 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4828 case Op_AndV: 4829 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4830 default: 4831 
fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4832 break; 4833 } 4834 } 4835 4836 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4837 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4838 switch (ideal_opc) { 4839 case Op_AddVB: 4840 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4841 case Op_AddVS: 4842 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4843 case Op_AddVI: 4844 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4845 case Op_AddVL: 4846 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4847 case Op_AddVF: 4848 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4849 case Op_AddVD: 4850 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4851 case Op_SubVB: 4852 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4853 case Op_SubVS: 4854 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4855 case Op_SubVI: 4856 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4857 case Op_SubVL: 4858 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4859 case Op_SubVF: 4860 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4861 case Op_SubVD: 4862 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4863 case Op_MulVS: 4864 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4865 case Op_MulVI: 4866 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4867 case Op_MulVL: 4868 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4869 case Op_MulVF: 4870 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4871 case Op_MulVD: 4872 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4873 case Op_DivVF: 4874 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4875 case Op_DivVD: 4876 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4877 case Op_FmaVF: 4878 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4879 case Op_FmaVD: 4880 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); 
break; 4881 case Op_MaxV: 4882 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4883 case Op_MinV: 4884 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4885 case Op_UMaxV: 4886 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4887 case Op_UMinV: 4888 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4889 case Op_XorV: 4890 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4891 case Op_OrV: 4892 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4893 case Op_AndV: 4894 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4895 default: 4896 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4897 break; 4898 } 4899 } 4900 4901 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4902 KRegister src1, KRegister src2) { 4903 BasicType etype = T_ILLEGAL; 4904 switch(mask_len) { 4905 case 2: 4906 case 4: 4907 case 8: etype = T_BYTE; break; 4908 case 16: etype = T_SHORT; break; 4909 case 32: etype = T_INT; break; 4910 case 64: etype = T_LONG; break; 4911 default: fatal("Unsupported type"); break; 4912 } 4913 assert(etype != T_ILLEGAL, ""); 4914 switch(ideal_opc) { 4915 case Op_AndVMask: 4916 kand(etype, dst, src1, src2); break; 4917 case Op_OrVMask: 4918 kor(etype, dst, src1, src2); break; 4919 case Op_XorVMask: 4920 kxor(etype, dst, src1, src2); break; 4921 default: 4922 fatal("Unsupported masked operation"); break; 4923 } 4924 } 4925 4926 /* 4927 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4928 * If src is NaN, the result is 0. 4929 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4930 * the result is equal to the value of Integer.MIN_VALUE. 4931 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4932 * the result is equal to the value of Integer.MAX_VALUE. 
4933 */ 4934 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4935 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4936 Register rscratch, AddressLiteral float_sign_flip, 4937 int vec_enc) { 4938 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4939 Label done; 4940 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4941 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4942 vptest(xtmp2, xtmp2, vec_enc); 4943 jccb(Assembler::equal, done); 4944 4945 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4946 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4947 4948 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4949 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4950 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4951 4952 // Recompute the mask for remaining special value. 4953 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4954 // Extract SRC values corresponding to TRUE mask lanes. 4955 vpand(xtmp4, xtmp2, src, vec_enc); 4956 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4957 // values are set. 
4958 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4959 4960 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4961 bind(done); 4962 } 4963 4964 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4965 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4966 Register rscratch, AddressLiteral float_sign_flip, 4967 int vec_enc) { 4968 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4969 Label done; 4970 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4971 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4972 kortestwl(ktmp1, ktmp1); 4973 jccb(Assembler::equal, done); 4974 4975 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4976 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4977 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4978 4979 kxorwl(ktmp1, ktmp1, ktmp2); 4980 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4981 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4982 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4983 bind(done); 4984 } 4985 4986 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4987 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4988 Register rscratch, AddressLiteral double_sign_flip, 4989 int vec_enc) { 4990 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4991 4992 Label done; 4993 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4994 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4995 kortestwl(ktmp1, ktmp1); 4996 jccb(Assembler::equal, done); 4997 4998 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4999 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5000 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5001 5002 kxorwl(ktmp1, ktmp1, ktmp2); 5003 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5004 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5005 evmovdquq(dst, ktmp1, xtmp2, true, 
vec_enc); 5006 bind(done); 5007 } 5008 5009 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5010 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5011 Register rscratch, AddressLiteral float_sign_flip, 5012 int vec_enc) { 5013 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5014 Label done; 5015 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5016 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5017 kortestwl(ktmp1, ktmp1); 5018 jccb(Assembler::equal, done); 5019 5020 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5021 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5022 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5023 5024 kxorwl(ktmp1, ktmp1, ktmp2); 5025 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5026 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5027 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5028 bind(done); 5029 } 5030 5031 /* 5032 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5033 * If src is NaN, the result is 0. 5034 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 5035 * the result is equal to the value of Long.MIN_VALUE. 5036 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 5037 * the result is equal to the value of Long.MAX_VALUE. 
5038 */ 5039 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5040 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5041 Register rscratch, AddressLiteral double_sign_flip, 5042 int vec_enc) { 5043 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5044 5045 Label done; 5046 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5047 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 5048 kortestwl(ktmp1, ktmp1); 5049 jccb(Assembler::equal, done); 5050 5051 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5052 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5053 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5054 5055 kxorwl(ktmp1, ktmp1, ktmp2); 5056 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5057 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5058 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5059 bind(done); 5060 } 5061 5062 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 5063 XMMRegister xtmp, int index, int vec_enc) { 5064 assert(vec_enc < Assembler::AVX_512bit, ""); 5065 if (vec_enc == Assembler::AVX_256bit) { 5066 vextractf128_high(xtmp, src); 5067 vshufps(dst, src, xtmp, index, vec_enc); 5068 } else { 5069 vshufps(dst, src, zero, index, vec_enc); 5070 } 5071 } 5072 5073 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5074 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 5075 AddressLiteral float_sign_flip, int src_vec_enc) { 5076 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5077 5078 Label done; 5079 // Compare the destination lanes with float_sign_flip 5080 // value to get mask for all special values. 
5081 movdqu(xtmp1, float_sign_flip, rscratch); 5082 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 5083 ptest(xtmp2, xtmp2); 5084 jccb(Assembler::equal, done); 5085 5086 // Flip float_sign_flip to get max integer value. 5087 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 5088 pxor(xtmp1, xtmp4); 5089 5090 // Set detination lanes corresponding to unordered source lanes as zero. 5091 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 5092 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 5093 5094 // Shuffle mask vector and pack lower doubles word from each quadword lane. 5095 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5096 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 5097 5098 // Recompute the mask for remaining special value. 5099 pxor(xtmp2, xtmp3); 5100 // Extract mask corresponding to non-negative source lanes. 5101 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 5102 5103 // Shuffle mask vector and pack lower doubles word from each quadword lane. 5104 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5105 pand(xtmp3, xtmp2); 5106 5107 // Replace destination lanes holding special value(0x80000000) with max int 5108 // if corresponding source lane holds a +ve value. 
5109 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5110 bind(done); 5111 } 5112 5113 5114 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5115 XMMRegister xtmp, Register rscratch, int vec_enc) { 5116 switch(to_elem_bt) { 5117 case T_SHORT: 5118 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5119 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5120 vpackusdw(dst, dst, zero, vec_enc); 5121 if (vec_enc == Assembler::AVX_256bit) { 5122 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5123 } 5124 break; 5125 case T_BYTE: 5126 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5127 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5128 vpackusdw(dst, dst, zero, vec_enc); 5129 if (vec_enc == Assembler::AVX_256bit) { 5130 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5131 } 5132 vpackuswb(dst, dst, zero, vec_enc); 5133 break; 5134 default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt)); 5135 } 5136 } 5137 5138 /* 5139 * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):- 5140 * a) Perform vector D2L/F2I cast. 5141 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5142 * It signifies that source value could be any of the special floating point 5143 * values(NaN,-Inf,Inf,Max,-Min). 5144 * c) Set destination to zero if source is NaN value. 5145 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 
5146 */ 5147 5148 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5149 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5150 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5151 int to_elem_sz = type2aelembytes(to_elem_bt); 5152 assert(to_elem_sz <= 4, ""); 5153 vcvttps2dq(dst, src, vec_enc); 5154 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5155 if (to_elem_sz < 4) { 5156 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5157 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5158 } 5159 } 5160 5161 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5162 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5163 Register rscratch, int vec_enc) { 5164 int to_elem_sz = type2aelembytes(to_elem_bt); 5165 assert(to_elem_sz <= 4, ""); 5166 vcvttps2dq(dst, src, vec_enc); 5167 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5168 switch(to_elem_bt) { 5169 case T_INT: 5170 break; 5171 case T_SHORT: 5172 evpmovdw(dst, dst, vec_enc); 5173 break; 5174 case T_BYTE: 5175 evpmovdb(dst, dst, vec_enc); 5176 break; 5177 default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt)); 5178 } 5179 } 5180 5181 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5182 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5183 Register rscratch, int vec_enc) { 5184 evcvttps2qq(dst, src, vec_enc); 5185 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 5186 } 5187 5188 // Handling for downcasting from double to integer or sub-word types on AVX2. 
5189 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5190 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5191 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5192 int to_elem_sz = type2aelembytes(to_elem_bt); 5193 assert(to_elem_sz < 8, ""); 5194 vcvttpd2dq(dst, src, vec_enc); 5195 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5196 float_sign_flip, vec_enc); 5197 if (to_elem_sz < 4) { 5198 // xtmp4 holds all zero lanes. 5199 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5200 } 5201 } 5202 5203 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5204 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5205 KRegister ktmp2, AddressLiteral sign_flip, 5206 Register rscratch, int vec_enc) { 5207 if (VM_Version::supports_avx512dq()) { 5208 evcvttpd2qq(dst, src, vec_enc); 5209 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5210 switch(to_elem_bt) { 5211 case T_LONG: 5212 break; 5213 case T_INT: 5214 evpmovsqd(dst, dst, vec_enc); 5215 break; 5216 case T_SHORT: 5217 evpmovsqd(dst, dst, vec_enc); 5218 evpmovdw(dst, dst, vec_enc); 5219 break; 5220 case T_BYTE: 5221 evpmovsqd(dst, dst, vec_enc); 5222 evpmovdb(dst, dst, vec_enc); 5223 break; 5224 default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt)); 5225 } 5226 } else { 5227 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5228 vcvttpd2dq(dst, src, vec_enc); 5229 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5230 switch(to_elem_bt) { 5231 case T_INT: 5232 break; 5233 case T_SHORT: 5234 evpmovdw(dst, dst, vec_enc); 5235 break; 5236 case T_BYTE: 5237 evpmovdb(dst, dst, 
vec_enc); 5238 break; 5239 default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt)); 5240 } 5241 } 5242 } 5243 5244 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5245 switch(to_elem_bt) { 5246 case T_LONG: 5247 evcvttps2qqs(dst, src, vec_enc); 5248 break; 5249 case T_INT: 5250 evcvttps2dqs(dst, src, vec_enc); 5251 break; 5252 case T_SHORT: 5253 evcvttps2dqs(dst, src, vec_enc); 5254 evpmovdw(dst, dst, vec_enc); 5255 break; 5256 case T_BYTE: 5257 evcvttps2dqs(dst, src, vec_enc); 5258 evpmovdb(dst, dst, vec_enc); 5259 break; 5260 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt)); 5261 } 5262 } 5263 5264 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) { 5265 switch(to_elem_bt) { 5266 case T_LONG: 5267 evcvttps2qqs(dst, src, vec_enc); 5268 break; 5269 case T_INT: 5270 evcvttps2dqs(dst, src, vec_enc); 5271 break; 5272 case T_SHORT: 5273 evcvttps2dqs(dst, src, vec_enc); 5274 evpmovdw(dst, dst, vec_enc); 5275 break; 5276 case T_BYTE: 5277 evcvttps2dqs(dst, src, vec_enc); 5278 evpmovdb(dst, dst, vec_enc); 5279 break; 5280 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt)); 5281 } 5282 } 5283 5284 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5285 switch(to_elem_bt) { 5286 case T_LONG: 5287 evcvttpd2qqs(dst, src, vec_enc); 5288 break; 5289 case T_INT: 5290 evcvttpd2dqs(dst, src, vec_enc); 5291 break; 5292 case T_SHORT: 5293 evcvttpd2dqs(dst, src, vec_enc); 5294 evpmovdw(dst, dst, vec_enc); 5295 break; 5296 case T_BYTE: 5297 evcvttpd2dqs(dst, src, vec_enc); 5298 evpmovdb(dst, dst, vec_enc); 5299 break; 5300 default: assert(false, "Unexpected basic type for target of vector castD2X 
AVX10 (reg src): %s", type2name(to_elem_bt)); 5301 } 5302 } 5303 5304 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) { 5305 switch(to_elem_bt) { 5306 case T_LONG: 5307 evcvttpd2qqs(dst, src, vec_enc); 5308 break; 5309 case T_INT: 5310 evcvttpd2dqs(dst, src, vec_enc); 5311 break; 5312 case T_SHORT: 5313 evcvttpd2dqs(dst, src, vec_enc); 5314 evpmovdw(dst, dst, vec_enc); 5315 break; 5316 case T_BYTE: 5317 evcvttpd2dqs(dst, src, vec_enc); 5318 evpmovdb(dst, dst, vec_enc); 5319 break; 5320 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt)); 5321 } 5322 } 5323 5324 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5325 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5326 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5327 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5328 // and re-instantiate original MXCSR.RC mode after that. 5329 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5330 5331 mov64(tmp, julong_cast(0.5L)); 5332 evpbroadcastq(xtmp1, tmp, vec_enc); 5333 vaddpd(xtmp1, src , xtmp1, vec_enc); 5334 evcvtpd2qq(dst, xtmp1, vec_enc); 5335 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5336 double_sign_flip, vec_enc);; 5337 5338 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5339 } 5340 5341 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5342 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5343 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5344 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5345 // and re-instantiate original MXCSR.RC mode after that. 
5346 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5347 5348 movl(tmp, jint_cast(0.5)); 5349 movq(xtmp1, tmp); 5350 vbroadcastss(xtmp1, xtmp1, vec_enc); 5351 vaddps(xtmp1, src , xtmp1, vec_enc); 5352 vcvtps2dq(dst, xtmp1, vec_enc); 5353 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5354 float_sign_flip, vec_enc); 5355 5356 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5357 } 5358 5359 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5360 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5361 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5362 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5363 // and re-instantiate original MXCSR.RC mode after that. 5364 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5365 5366 movl(tmp, jint_cast(0.5)); 5367 movq(xtmp1, tmp); 5368 vbroadcastss(xtmp1, xtmp1, vec_enc); 5369 vaddps(xtmp1, src , xtmp1, vec_enc); 5370 vcvtps2dq(dst, xtmp1, vec_enc); 5371 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5372 5373 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5374 } 5375 5376 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5377 BasicType from_elem_bt, BasicType to_elem_bt) { 5378 switch (from_elem_bt) { 5379 case T_BYTE: 5380 switch (to_elem_bt) { 5381 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5382 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5383 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5384 default: ShouldNotReachHere(); 5385 } 5386 break; 5387 case T_SHORT: 5388 switch (to_elem_bt) { 5389 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5390 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5391 default: ShouldNotReachHere(); 5392 } 5393 break; 
5394 case T_INT: 5395 assert(to_elem_bt == T_LONG, ""); 5396 vpmovzxdq(dst, src, vlen_enc); 5397 break; 5398 default: 5399 ShouldNotReachHere(); 5400 } 5401 } 5402 5403 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5404 BasicType from_elem_bt, BasicType to_elem_bt) { 5405 switch (from_elem_bt) { 5406 case T_BYTE: 5407 switch (to_elem_bt) { 5408 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5409 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5410 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5411 default: ShouldNotReachHere(); 5412 } 5413 break; 5414 case T_SHORT: 5415 switch (to_elem_bt) { 5416 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5417 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5418 default: ShouldNotReachHere(); 5419 } 5420 break; 5421 case T_INT: 5422 assert(to_elem_bt == T_LONG, ""); 5423 vpmovsxdq(dst, src, vlen_enc); 5424 break; 5425 default: 5426 ShouldNotReachHere(); 5427 } 5428 } 5429 5430 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5431 BasicType dst_bt, BasicType src_bt, int vlen) { 5432 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5433 assert(vlen_enc != AVX_512bit, ""); 5434 5435 int dst_bt_size = type2aelembytes(dst_bt); 5436 int src_bt_size = type2aelembytes(src_bt); 5437 if (dst_bt_size > src_bt_size) { 5438 switch (dst_bt_size / src_bt_size) { 5439 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5440 case 4: vpmovsxbd(dst, src, vlen_enc); break; 5441 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5442 default: ShouldNotReachHere(); 5443 } 5444 } else { 5445 assert(dst_bt_size < src_bt_size, ""); 5446 switch (src_bt_size / dst_bt_size) { 5447 case 2: { 5448 if (vlen_enc == AVX_128bit) { 5449 vpacksswb(dst, src, src, vlen_enc); 5450 } else { 5451 vpacksswb(dst, src, src, vlen_enc); 5452 vpermq(dst, dst, 0x08, vlen_enc); 5453 } 5454 break; 5455 } 5456 case 4: { 5457 if (vlen_enc == 
AVX_128bit) { 5458 vpackssdw(dst, src, src, vlen_enc); 5459 vpacksswb(dst, dst, dst, vlen_enc); 5460 } else { 5461 vpackssdw(dst, src, src, vlen_enc); 5462 vpermq(dst, dst, 0x08, vlen_enc); 5463 vpacksswb(dst, dst, dst, AVX_128bit); 5464 } 5465 break; 5466 } 5467 case 8: { 5468 if (vlen_enc == AVX_128bit) { 5469 vpshufd(dst, src, 0x08, vlen_enc); 5470 vpackssdw(dst, dst, dst, vlen_enc); 5471 vpacksswb(dst, dst, dst, vlen_enc); 5472 } else { 5473 vpshufd(dst, src, 0x08, vlen_enc); 5474 vpermq(dst, dst, 0x08, vlen_enc); 5475 vpackssdw(dst, dst, dst, AVX_128bit); 5476 vpacksswb(dst, dst, dst, AVX_128bit); 5477 } 5478 break; 5479 } 5480 default: ShouldNotReachHere(); 5481 } 5482 } 5483 } 5484 5485 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5486 bool merge, BasicType bt, int vlen_enc) { 5487 if (bt == T_INT) { 5488 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5489 } else { 5490 assert(bt == T_LONG, ""); 5491 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5492 } 5493 } 5494 5495 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5496 bool merge, BasicType bt, int vlen_enc) { 5497 if (bt == T_INT) { 5498 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5499 } else { 5500 assert(bt == T_LONG, ""); 5501 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5502 } 5503 } 5504 5505 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5506 Register rtmp2, XMMRegister xtmp, int mask_len, 5507 int vec_enc) { 5508 int index = 0; 5509 int vindex = 0; 5510 mov64(rtmp1, 0x0101010101010101L); 5511 pdepq(rtmp1, src, rtmp1); 5512 if (mask_len > 8) { 5513 movq(rtmp2, src); 5514 vpxor(xtmp, xtmp, xtmp, vec_enc); 5515 movq(xtmp, rtmp1); 5516 } 5517 movq(dst, rtmp1); 5518 5519 mask_len -= 8; 5520 while (mask_len > 0) { 5521 assert ((mask_len & 0x7) == 0, "mask must be multiple of 
8"); 5522 index++; 5523 if ((index % 2) == 0) { 5524 pxor(xtmp, xtmp); 5525 } 5526 mov64(rtmp1, 0x0101010101010101L); 5527 shrq(rtmp2, 8); 5528 pdepq(rtmp1, rtmp2, rtmp1); 5529 pinsrq(xtmp, rtmp1, index % 2); 5530 vindex = index / 2; 5531 if (vindex) { 5532 // Write entire 16 byte vector when both 64 bit 5533 // lanes are update to save redundant instructions. 5534 if (index % 2) { 5535 vinsertf128(dst, dst, xtmp, vindex); 5536 } 5537 } else { 5538 vmovdqu(dst, xtmp); 5539 } 5540 mask_len -= 8; 5541 } 5542 } 5543 5544 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5545 switch(opc) { 5546 case Op_VectorMaskTrueCount: 5547 popcntq(dst, tmp); 5548 break; 5549 case Op_VectorMaskLastTrue: 5550 if (VM_Version::supports_lzcnt()) { 5551 lzcntq(tmp, tmp); 5552 movl(dst, 63); 5553 subl(dst, tmp); 5554 } else { 5555 movl(dst, -1); 5556 bsrq(tmp, tmp); 5557 cmov32(Assembler::notZero, dst, tmp); 5558 } 5559 break; 5560 case Op_VectorMaskFirstTrue: 5561 if (UseCountTrailingZerosInstruction) { 5562 if (masklen < 32) { 5563 orl(tmp, 1 << masklen); 5564 tzcntl(dst, tmp); 5565 } else if (masklen == 32) { 5566 tzcntl(dst, tmp); 5567 } else { 5568 assert(masklen == 64, ""); 5569 tzcntq(dst, tmp); 5570 } 5571 } else { 5572 if (masklen < 32) { 5573 orl(tmp, 1 << masklen); 5574 bsfl(dst, tmp); 5575 } else { 5576 assert(masklen == 32 || masklen == 64, ""); 5577 movl(dst, masklen); 5578 if (masklen == 32) { 5579 bsfl(tmp, tmp); 5580 } else { 5581 bsfq(tmp, tmp); 5582 } 5583 cmov32(Assembler::notZero, dst, tmp); 5584 } 5585 } 5586 break; 5587 case Op_VectorMaskToLong: 5588 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5589 break; 5590 default: assert(false, "Unhandled mask operation"); 5591 } 5592 } 5593 5594 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5595 int masklen, int masksize, int vec_enc) { 5596 assert(VM_Version::supports_popcnt(), ""); 5597 
5598 if(VM_Version::supports_avx512bw()) { 5599 kmovql(tmp, mask); 5600 } else { 5601 assert(masklen <= 16, ""); 5602 kmovwl(tmp, mask); 5603 } 5604 5605 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5606 // operations needs to be clipped. 5607 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5608 andq(tmp, (1 << masklen) - 1); 5609 } 5610 5611 vector_mask_operation_helper(opc, dst, tmp, masklen); 5612 } 5613 5614 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5615 Register tmp, int masklen, BasicType bt, int vec_enc) { 5616 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5617 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5618 assert(VM_Version::supports_popcnt(), ""); 5619 5620 bool need_clip = false; 5621 switch(bt) { 5622 case T_BOOLEAN: 5623 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5624 vpxor(xtmp, xtmp, xtmp, vec_enc); 5625 vpsubb(xtmp, xtmp, mask, vec_enc); 5626 vpmovmskb(tmp, xtmp, vec_enc); 5627 need_clip = masklen < 16; 5628 break; 5629 case T_BYTE: 5630 vpmovmskb(tmp, mask, vec_enc); 5631 need_clip = masklen < 16; 5632 break; 5633 case T_SHORT: 5634 vpacksswb(xtmp, mask, mask, vec_enc); 5635 if (masklen >= 16) { 5636 vpermpd(xtmp, xtmp, 8, vec_enc); 5637 } 5638 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5639 need_clip = masklen < 16; 5640 break; 5641 case T_INT: 5642 case T_FLOAT: 5643 vmovmskps(tmp, mask, vec_enc); 5644 need_clip = masklen < 4; 5645 break; 5646 case T_LONG: 5647 case T_DOUBLE: 5648 vmovmskpd(tmp, mask, vec_enc); 5649 need_clip = masklen < 2; 5650 break; 5651 default: assert(false, "Unhandled type, %s", type2name(bt)); 5652 } 5653 5654 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5655 // operations needs to be clipped. 
5656 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5657 // need_clip implies masklen < 32 5658 andq(tmp, (1 << masklen) - 1); 5659 } 5660 5661 vector_mask_operation_helper(opc, dst, tmp, masklen); 5662 } 5663 5664 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5665 Register rtmp2, int mask_len) { 5666 kmov(rtmp1, src); 5667 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5668 mov64(rtmp2, -1L); 5669 pextq(rtmp2, rtmp2, rtmp1); 5670 kmov(dst, rtmp2); 5671 } 5672 5673 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5674 XMMRegister mask, Register rtmp, Register rscratch, 5675 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5676 int vec_enc) { 5677 assert(type2aelembytes(bt) >= 4, ""); 5678 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5679 address compress_perm_table = nullptr; 5680 address expand_perm_table = nullptr; 5681 if (type2aelembytes(bt) == 8) { 5682 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5683 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5684 vmovmskpd(rtmp, mask, vec_enc); 5685 } else { 5686 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5687 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5688 vmovmskps(rtmp, mask, vec_enc); 5689 } 5690 shlq(rtmp, 5); // for 32 byte permute row. 
5691 if (opcode == Op_CompressV) { 5692 lea(rscratch, ExternalAddress(compress_perm_table)); 5693 } else { 5694 lea(rscratch, ExternalAddress(expand_perm_table)); 5695 } 5696 addptr(rtmp, rscratch); 5697 vmovdqu(permv, Address(rtmp)); 5698 vpermps(dst, permv, src, Assembler::AVX_256bit); 5699 vpxor(xtmp, xtmp, xtmp, vec_enc); 5700 // Blend the result with zero vector using permute mask, each column entry 5701 // in a permute table row contains either a valid permute index or a -1 (default) 5702 // value, this can potentially be used as a blending mask after 5703 // compressing/expanding the source vector lanes. 5704 vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv); 5705 } 5706 5707 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5708 bool merge, BasicType bt, int vec_enc) { 5709 if (opcode == Op_CompressV) { 5710 switch(bt) { 5711 case T_BYTE: 5712 evpcompressb(dst, mask, src, merge, vec_enc); 5713 break; 5714 case T_CHAR: 5715 case T_SHORT: 5716 evpcompressw(dst, mask, src, merge, vec_enc); 5717 break; 5718 case T_INT: 5719 evpcompressd(dst, mask, src, merge, vec_enc); 5720 break; 5721 case T_FLOAT: 5722 evcompressps(dst, mask, src, merge, vec_enc); 5723 break; 5724 case T_LONG: 5725 evpcompressq(dst, mask, src, merge, vec_enc); 5726 break; 5727 case T_DOUBLE: 5728 evcompresspd(dst, mask, src, merge, vec_enc); 5729 break; 5730 default: 5731 fatal("Unsupported type %s", type2name(bt)); 5732 break; 5733 } 5734 } else { 5735 assert(opcode == Op_ExpandV, ""); 5736 switch(bt) { 5737 case T_BYTE: 5738 evpexpandb(dst, mask, src, merge, vec_enc); 5739 break; 5740 case T_CHAR: 5741 case T_SHORT: 5742 evpexpandw(dst, mask, src, merge, vec_enc); 5743 break; 5744 case T_INT: 5745 evpexpandd(dst, mask, src, merge, vec_enc); 5746 break; 5747 case T_FLOAT: 5748 evexpandps(dst, mask, src, merge, vec_enc); 5749 break; 5750 case T_LONG: 5751 evpexpandq(dst, mask, src, merge, vec_enc); 5752 break; 5753 case T_DOUBLE: 
5754 evexpandpd(dst, mask, src, merge, vec_enc); 5755 break; 5756 default: 5757 fatal("Unsupported type %s", type2name(bt)); 5758 break; 5759 } 5760 } 5761 } 5762 5763 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5764 KRegister ktmp1, int vec_enc) { 5765 if (opcode == Op_SignumVD) { 5766 vsubpd(dst, zero, one, vec_enc); 5767 // if src < 0 ? -1 : 1 5768 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5769 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5770 // if src == NaN, -0.0 or 0.0 return src. 5771 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5772 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5773 } else { 5774 assert(opcode == Op_SignumVF, ""); 5775 vsubps(dst, zero, one, vec_enc); 5776 // if src < 0 ? -1 : 1 5777 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5778 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5779 // if src == NaN, -0.0 or 0.0 return src. 5780 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5781 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5782 } 5783 } 5784 5785 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5786 XMMRegister xtmp1, int vec_enc) { 5787 if (opcode == Op_SignumVD) { 5788 vsubpd(dst, zero, one, vec_enc); 5789 // if src < 0 ? -1 : 1 5790 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5791 // if src == NaN, -0.0 or 0.0 return src. 5792 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5793 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5794 } else { 5795 assert(opcode == Op_SignumVF, ""); 5796 vsubps(dst, zero, one, vec_enc); 5797 // if src < 0 ? -1 : 1 5798 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5799 // if src == NaN, -0.0 or 0.0 return src. 
5800 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5801 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5802 } 5803 } 5804 5805 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5806 if (VM_Version::supports_avx512bw()) { 5807 if (mask_len > 32) { 5808 kmovql(dst, src); 5809 } else { 5810 kmovdl(dst, src); 5811 if (mask_len != 32) { 5812 kshiftrdl(dst, dst, 32 - mask_len); 5813 } 5814 } 5815 } else { 5816 assert(mask_len <= 16, ""); 5817 kmovwl(dst, src); 5818 if (mask_len != 16) { 5819 kshiftrwl(dst, dst, 16 - mask_len); 5820 } 5821 } 5822 } 5823 5824 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5825 int lane_size = type2aelembytes(bt); 5826 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5827 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) { 5828 movptr(rtmp, imm32); 5829 switch(lane_size) { 5830 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5831 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5832 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5833 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5834 fatal("Unsupported lane size %d", lane_size); 5835 break; 5836 } 5837 } else { 5838 movptr(rtmp, imm32); 5839 movq(dst, rtmp); 5840 switch(lane_size) { 5841 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5842 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5843 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5844 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5845 fatal("Unsupported lane size %d", lane_size); 5846 break; 5847 } 5848 } 5849 } 5850 5851 // 5852 // Following is lookup table based popcount computation algorithm:- 5853 // Index Bit set count 5854 // [ 0000 -> 0, 5855 // 0001 -> 1, 5856 // 0010 -> 1, 5857 // 0011 -> 2, 5858 // 0100 -> 1, 5859 // 0101 -> 2, 5860 // 0110 -> 2, 5861 // 0111 -> 3, 5862 // 1000 -> 1, 5863 // 1001 -> 2, 5864 // 1010 -> 3, 5865 // 1011 -> 3, 5866 // 
1100 -> 2, 5867 // 1101 -> 3, 5868 // 1111 -> 4 ] 5869 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5870 // shuffle indices for lookup table access. 5871 // b. Right shift each byte of vector lane by 4 positions. 5872 // c. Count the number of 1s in 4 MSB bits each byte. These bits are used as 5873 // shuffle indices for lookup table access. 5874 // d. Add the bitset count of upper and lower 4 bits of each byte. 5875 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5876 // count of all the bytes of a quadword. 5877 // f. Perform step e. for upper 128bit vector lane. 5878 // g. Pack the bitset count of quadwords back to double word. 5879 // h. Unpacking and packing operations are not needed for 64bit vector lane. 5880 5881 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5882 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5883 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5884 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5885 vpsrlw(dst, src, 4, vec_enc); 5886 vpand(dst, dst, xtmp1, vec_enc); 5887 vpand(xtmp1, src, xtmp1, vec_enc); 5888 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5889 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5890 vpshufb(dst, xtmp2, dst, vec_enc); 5891 vpaddb(dst, dst, xtmp1, vec_enc); 5892 } 5893 5894 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5895 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5896 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5897 // Following code is as per steps e,f,g and h of above algorithm. 
5898 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5899 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5900 vpsadbw(dst, dst, xtmp2, vec_enc); 5901 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5902 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5903 vpackuswb(dst, xtmp1, dst, vec_enc); 5904 } 5905 5906 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5907 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5908 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5909 // Add the popcount of upper and lower bytes of word. 5910 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5911 vpsrlw(dst, xtmp1, 8, vec_enc); 5912 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5913 vpaddw(dst, dst, xtmp1, vec_enc); 5914 } 5915 5916 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5917 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5918 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5919 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5920 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5921 } 5922 5923 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5924 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5925 switch(bt) { 5926 case T_LONG: 5927 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5928 break; 5929 case T_INT: 5930 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5931 break; 5932 case T_CHAR: 5933 case T_SHORT: 5934 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5935 break; 5936 case T_BYTE: 5937 case T_BOOLEAN: 5938 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5939 break; 5940 default: 5941 fatal("Unsupported type %s", type2name(bt)); 5942 break; 5943 } 5944 } 5945 5946 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5947 KRegister mask, bool merge, int vec_enc) { 5948 assert(VM_Version::supports_avx512vl() || vec_enc == 
Assembler::AVX_512bit, ""); 5949 switch(bt) { 5950 case T_LONG: 5951 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5952 evpopcntq(dst, mask, src, merge, vec_enc); 5953 break; 5954 case T_INT: 5955 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5956 evpopcntd(dst, mask, src, merge, vec_enc); 5957 break; 5958 case T_CHAR: 5959 case T_SHORT: 5960 assert(VM_Version::supports_avx512_bitalg(), ""); 5961 evpopcntw(dst, mask, src, merge, vec_enc); 5962 break; 5963 case T_BYTE: 5964 case T_BOOLEAN: 5965 assert(VM_Version::supports_avx512_bitalg(), ""); 5966 evpopcntb(dst, mask, src, merge, vec_enc); 5967 break; 5968 default: 5969 fatal("Unsupported type %s", type2name(bt)); 5970 break; 5971 } 5972 } 5973 5974 // Bit reversal algorithm first reverses the bits of each byte followed by 5975 // a byte level reversal for multi-byte primitive types (short/int/long). 5976 // Algorithm performs a lookup table access to get reverse bit sequence 5977 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5978 // is obtained by swapping the reverse bit sequences of upper and lower 5979 // nibble of a byte. 5980 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5981 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5982 if (VM_Version::supports_avx512vlbw()) { 5983 5984 // Get the reverse bit sequence of lower nibble of each byte. 5985 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5986 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5987 evpandq(dst, xtmp2, src, vec_enc); 5988 vpshufb(dst, xtmp1, dst, vec_enc); 5989 vpsllq(dst, dst, 4, vec_enc); 5990 5991 // Get the reverse bit sequence of upper nibble of each byte. 
5992 vpandn(xtmp2, xtmp2, src, vec_enc); 5993 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5994 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5995 5996 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5997 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5998 evporq(xtmp2, dst, xtmp2, vec_enc); 5999 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6000 6001 } else if(vec_enc == Assembler::AVX_512bit) { 6002 // Shift based bit reversal. 6003 assert(bt == T_LONG || bt == T_INT, ""); 6004 6005 // Swap lower and upper nibble of each byte. 6006 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 6007 6008 // Swap two least and most significant bits of each nibble. 6009 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 6010 6011 // Swap adjacent pair of bits. 6012 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6013 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 6014 6015 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6016 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 6017 } else { 6018 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 6019 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6020 6021 // Get the reverse bit sequence of lower nibble of each byte. 6022 vpand(dst, xtmp2, src, vec_enc); 6023 vpshufb(dst, xtmp1, dst, vec_enc); 6024 vpsllq(dst, dst, 4, vec_enc); 6025 6026 // Get the reverse bit sequence of upper nibble of each byte. 6027 vpandn(xtmp2, xtmp2, src, vec_enc); 6028 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6029 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6030 6031 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6032 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
6033 vpor(xtmp2, dst, xtmp2, vec_enc); 6034 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6035 } 6036 } 6037 6038 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 6039 XMMRegister xtmp, Register rscratch) { 6040 assert(VM_Version::supports_gfni(), ""); 6041 assert(rscratch != noreg || always_reachable(mask), "missing"); 6042 6043 // Galois field instruction based bit reversal based on following algorithm. 6044 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6045 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 6046 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 6047 vector_reverse_byte(bt, dst, xtmp, vec_enc); 6048 } 6049 6050 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 6051 XMMRegister xtmp1, Register rtmp, int vec_enc) { 6052 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 6053 evpandq(dst, xtmp1, src, vec_enc); 6054 vpsllq(dst, dst, nbits, vec_enc); 6055 vpandn(xtmp1, xtmp1, src, vec_enc); 6056 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 6057 evporq(dst, dst, xtmp1, vec_enc); 6058 } 6059 6060 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6061 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6062 // Shift based bit reversal. 6063 assert(VM_Version::supports_evex(), ""); 6064 switch(bt) { 6065 case T_LONG: 6066 // Swap upper and lower double word of each quad word. 6067 evprorq(xtmp1, k0, src, 32, true, vec_enc); 6068 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 6069 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6070 break; 6071 case T_INT: 6072 // Swap upper and lower word of each double word. 6073 evprord(xtmp1, k0, src, 16, true, vec_enc); 6074 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6075 break; 6076 case T_CHAR: 6077 case T_SHORT: 6078 // Swap upper and lower byte of each word. 
6079 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 6080 break; 6081 case T_BYTE: 6082 evmovdquq(dst, k0, src, true, vec_enc); 6083 break; 6084 default: 6085 fatal("Unsupported type %s", type2name(bt)); 6086 break; 6087 } 6088 } 6089 6090 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6091 if (bt == T_BYTE) { 6092 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6093 evmovdquq(dst, k0, src, true, vec_enc); 6094 } else { 6095 vmovdqu(dst, src); 6096 } 6097 return; 6098 } 6099 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6100 // pre-computed shuffle indices. 6101 switch(bt) { 6102 case T_LONG: 6103 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6104 break; 6105 case T_INT: 6106 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6107 break; 6108 case T_CHAR: 6109 case T_SHORT: 6110 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6111 break; 6112 default: 6113 fatal("Unsupported type %s", type2name(bt)); 6114 break; 6115 } 6116 vpshufb(dst, src, dst, vec_enc); 6117 } 6118 6119 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6120 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6121 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6122 assert(is_integral_type(bt), ""); 6123 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6124 assert(VM_Version::supports_avx512cd(), ""); 6125 switch(bt) { 6126 case T_LONG: 6127 evplzcntq(dst, ktmp, src, merge, vec_enc); 6128 break; 6129 case T_INT: 6130 evplzcntd(dst, ktmp, src, merge, vec_enc); 6131 break; 6132 case T_SHORT: 6133 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6134 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6135 
evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6136 vpunpckhwd(dst, xtmp1, src, vec_enc); 6137 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6138 vpackusdw(dst, xtmp2, dst, vec_enc); 6139 break; 6140 case T_BYTE: 6141 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6142 // accessing the lookup table. 6143 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6144 // accessing the lookup table. 6145 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6146 assert(VM_Version::supports_avx512bw(), ""); 6147 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6148 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6149 vpand(xtmp2, dst, src, vec_enc); 6150 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6151 vpsrlw(xtmp3, src, 4, vec_enc); 6152 vpand(xtmp3, dst, xtmp3, vec_enc); 6153 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6154 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6155 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6156 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6157 break; 6158 default: 6159 fatal("Unsupported type %s", type2name(bt)); 6160 break; 6161 } 6162 } 6163 6164 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6165 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6166 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6167 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6168 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6169 // accessing the lookup table. 6170 vpand(dst, xtmp2, src, vec_enc); 6171 vpshufb(dst, xtmp1, dst, vec_enc); 6172 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6173 // accessing the lookup table. 6174 vpsrlw(xtmp3, src, 4, vec_enc); 6175 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6176 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6177 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  vpaddb(dst, dst, xtmp2, vec_enc);
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}

// Emits the leading zero count for 16 bit lanes on AVX targets.
// The per-byte counts are produced first by vector_count_leading_zeros_byte_avx
// and are then merged into a single count per word.
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  // xtmp2 = (lower byte count << 8) + per-byte counts, i.e. the sum of both
  // byte counts ends up in the upper byte of each word.
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  // Pick the summed counts for words selected by the xtmp3 mask.
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  // Move the count held in the upper byte down to the low end of the word.
  vpsrlw(dst, dst, 8, vec_enc);
}

// Emits the leading zero count for 32 bit lanes on AVX targets.
// xtmp1 and xtmp2 are clobbered, xtmp3 is handed to vblendvps as a scratch register.
void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
  // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
  // exponent as the leading zero count.

  // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
  // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
  // contributes to the leading number of zeros.
  vpsrld(dst, src, 1, vec_enc);
  vpandn(dst, dst, src, vec_enc);

  vcvtdq2ps(dst, dst, vec_enc);

  // By comparing the register to itself, all the bits in the destination are set.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);

  // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
  vpsrld(xtmp2, xtmp1, 24, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, xtmp2, dst, vec_enc);

  // Subtract 127 from the exponent, which removes the bias from the exponent.
  vpsrld(xtmp2, xtmp1, 25, vec_enc);
  vpsubd(dst, dst, xtmp2, vec_enc);

  // xtmp2 = 31 in every lane (all ones shifted right by 27).
  vpsrld(xtmp2, xtmp1, 27, vec_enc);

  // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
  // is found in any of the lanes, replace the lane with -1 from xtmp1.
  vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);

  // If the original value is negative, replace the lane with 31.
  vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);

  // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
  // and for negative numbers the result is 0 as the exponent was replaced with 31.
  vpsubd(dst, xtmp2, dst, vec_enc);
}

// Emits the leading zero count for 64 bit lanes on AVX targets by combining
// the 32 bit counts of both halves of each lane.
// rtmp is not referenced by this body.
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // Find the leading zeros of the top and bottom halves of the long individually.
  vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);

  // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
  vpsrlq(xtmp1, dst, 32, vec_enc);
  // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
  // be in the most significant position of the bottom half.
  vpsrlq(xtmp2, dst, 6, vec_enc);

  // In the bottom half, add the top half and bottom half results.
  vpaddq(dst, xtmp1, dst, vec_enc);

  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
  // which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
  // the lane as required.
  vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
}

// Entry point for the AVX (non 512 bit) leading zero count: selects the
// emitter matching the element type bt. Any other type is a fatal error.
void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
                                                       XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                       Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "unexpected type");
  assert(vec_enc < Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_INT:
      // The int variant takes no general purpose temporary, so rtmp is not passed.
      vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
      break;
    case T_SHORT:
      vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_BYTE:
      vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// Emits a packed integer subtraction (dst = src1 - src2) using the
// instruction that matches the lane width of bt.
void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
  switch(bt) {
    case T_BYTE:
      vpsubb(dst, src1, src2, vec_enc);
      break;
    case T_SHORT:
      vpsubw(dst, src1, src2, vec_enc);
      break;
    case T_INT:
      vpsubd(dst, src1, src2, vec_enc);
      break;
    case T_LONG:
      vpsubq(dst, src1, src2, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// Trailing zero count computation is based on leading zero count operation as per
// following equation. All AVX3 targets support AVX512CD feature which offers
// direct vector instruction to compute leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp4 = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp4 = xtmp4 + src, i.e. src - 1
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp4 = xtmp4 & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  // dst = CLZ(xtmp4)
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  // xtmp4 = lane width in bits, broadcast to every lane.
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  // dst = lane width - CLZ
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}

// Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp3 = 0
  vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
  // xtmp3 = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp3 = xtmp3 | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  // dst = POPC(xtmp3)
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  // xtmp1 = lane width in bits, broadcast to every lane.
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  // dst = lane width - POPC
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}

// Emits a 32 bit unsigned division. The dividend is passed in the register
// named rax, which also receives the quotient; the register named rdx is
// used as a temporary and is overwritten.
void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  // A divisor with the sign bit set is handled without a divide instruction.
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1() && VM_Version::supports_avx()) {
    andnl(rax, rdx, rax);
  } else {
    // Same computation without the and-not instruction.
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}

// Emits a 32 bit unsigned remainder. The dividend is passed in the register
// named rax; the remainder is produced in the register named rdx. rax is
// overwritten on both paths.
void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  // rdx keeps the dividend while rax is used for the intermediate values.
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1() && VM_Version::supports_avx()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}

// Emits a combined 32 bit unsigned division and remainder. The dividend is
// passed in the register named rax, which receives the quotient; the
// remainder is produced in the register named rdx. tmp is only written on
// the negative divisor fast path.
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1() && VM_Version::supports_avx()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // The same intermediate value feeds both results, so keep a copy in tmp.
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}

// Emits a bit reversal of the 32 bit value in src into dst.
// With GFNI support the vector temporaries xtmp1/xtmp2 and rtmp are used,
// otherwise only rtmp is used as scratch.
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  // The steps above only rearrange bits inside each byte; the byte swap is
  // emitted for both paths.
  bswapl(dst);
}

// Emits a bit reversal of the 64 bit value in src into dst.
// With GFNI support the vector temporaries and rtmp1 are used, otherwise
// rtmp1 and rtmp2 are used as scratch.
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Each step loads a mask into rtmp2, uses it for one half of the swap
    // and its complement (notq) for the other half.

    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  // The steps above only rearrange bits inside each byte; the byte swap is
  // emitted for both paths.
  bswapq(dst);
}

// Emits a 64 bit unsigned division. The dividend is passed in the register
// named rax, which also receives the quotient; the register named rdx is
// used as a temporary and is overwritten.
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1() && VM_Version::supports_avx()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

// Emits a 64 bit unsigned remainder. The dividend is passed in the register
// named rax; the remainder is produced in the register named rdx. rax is
// overwritten on both paths.
void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  // rdx keeps the dividend while rax is used for the intermediate values.
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1() && VM_Version::supports_avx()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

// Emits a combined 64 bit unsigned division and remainder. The dividend is
// passed in the register named rax, which receives the quotient; the
// remainder is produced in the register named rdx. tmp is only written on
// the negative divisor fast path.
void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1() && VM_Version::supports_avx()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // The same intermediate value feeds both results, so keep a copy in tmp.
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}

// Emits a byte rearrange of src by the indices in shuffle, written to dst.
// Requires AVX512BW. The result is assembled in four masked steps, one per
// 128 bit lane of src; xtmp1, xtmp2, xtmp3, rtmp and ktmp are clobbered.
void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using
  // lower 4 bit of each shuffle lane, thus all shuffle indices are
  // normalized to index range 0-15. This makes sure that all the multiples
  // of an index value are placed at same relative position in 128 bit
  // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be 16th element in their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 32 in every byte (16 shifted left by one).
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  // xtmp1 = 48 in every byte (16 + 32).
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 64 in every byte (32 shifted left by one).
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}

// Emits a rearrange of 32 bit lanes (T_INT or T_FLOAT) of src by the indices
// in shuffle. 128 bit vectors use vpermilps for both types; wider vectors use
// the permute instruction matching bt.
void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}

// Emits the scalar half precision arithmetic instruction selected by the
// ideal opcode. Any other opcode trips the assert in debug builds.
void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  switch(opcode) {
    case Op_AddHF: vaddsh(dst, src1, src2); break;
    case Op_SubHF: vsubsh(dst, src1, src2); break;
    case Op_MulHF: vmulsh(dst, src1, src2); break;
    case Op_DivHF: vdivsh(dst, src1, src2); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

// Emits a signed saturating add or subtract for byte and short lanes
// (register operand form). ideal_opc must be Op_SaturatingAddV or
// Op_SaturatingSubV; other element types are a fatal error.
void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// Emits an unsigned saturating add or subtract for byte and short lanes
// (register operand form). ideal_opc must be Op_SaturatingAddV or
// Op_SaturatingSubV; other element types are a fatal error.
void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// Emits an unsigned saturating subtract for int/long lanes on EVEX targets
// using a mask register compare followed by a masked subtract.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                              XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
  // overflow_mask = Inp1 <u Inp2
  // NOTE(review): the compare below is emitted with operands (src2, src1), so ktmp appears to select
  // the lanes where src2 <u src1, i.e. the lanes whose difference is kept, rather than the overflow
  // lanes named above. Presumably evmasked_op zeroes the unselected lanes - confirm against evmasked_op.
  evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}

// Emits an unsigned saturating subtract for int/long lanes on AVX targets.
// xtmp1 and xtmp2 are clobbered.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                             XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  // xtmp1 = MIN_VALUE
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  // xtmp2 = Inp1 + MIN_VALUE, xtmp1 = Inp2 + MIN_VALUE
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  // xtmp2 = Mask
  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}

// Emits an unsigned saturating add for int/long lanes on EVEX targets.
// xtmp1, xtmp2 and ktmp are clobbered.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare:  Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res  = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
}

//
// Section 2-13 Hacker's Delight list following overflow detection check for saturating
// unsigned addition operation.
//    overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to following reduced expression
//    overflow_mask =  (a + b) <u (a | b)
//
// and also verified it through Alive2 solver.
// (https://alive2.llvm.org/ce/z/XDQ7dY)
//

// Emits an unsigned saturating add for int/long lanes on AVX targets.
// xtmp1, xtmp2 and xtmp3 are clobbered.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                             XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = Minimum signed value. As compute_allones is true, xtmp1 is left
  // holding all ones, which is the unsigned maximum used by the final blend.
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  // Res = overflow mask ? Max_Unsigned (xtmp1) : Res
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}

// Sets ktmp from the sign bits of the 64 bit lanes of src. Uses evpmovq2m
// when AVX512DQ is available, otherwise emulates it with a sign extending
// shift and a compare against all ones.
// xtmp2_hold_M1 tells the emulation path that xtmp2 already holds all ones.
// xtmp1 and xtmp2 are only written on the emulation path.
void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // xtmp1 = sign of each lane replicated across the lane.
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}

// Sets ktmp from the sign bits of the 32 bit lanes of src. Uses evpmovd2m
// when AVX512DQ is available, otherwise emulates it with a sign extending
// shift and a compare against all ones.
// xtmp2_hold_M1 tells the emulation path that xtmp2 already holds all ones.
// xtmp1 and xtmp2 are only written on the emulation path.
void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // xtmp1 = sign of each lane replicated across the lane.
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}


// Fills every int/long lane of dst with the sign of the matching lane of src.
void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  if (elem_bt == T_LONG) {
    if (VM_Version::supports_evex()) {
      evpsraq(dst, src, 63, vlen_enc);
    } else {
      // Without EVEX the 64 bit case is built from a 32 bit arithmetic shift
      // followed by a doubleword shuffle with selector 0xF5 (selects elements 1, 1, 3, 3).
      vpsrad(dst, src, 31, vlen_enc);
      vpshufd(dst, dst, 0xF5, vlen_enc);
    }
  } else {
    assert(elem_bt == T_INT, "");
    vpsrad(dst, src, 31, vlen_enc);
  }
}
// Materializes the maximum signed int/long value in every lane of dst by
// shifting an all ones vector right by one bit.
// When compute_allones is true the all ones vector is first generated into
// allones, otherwise allones is expected to hold it already.
void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsrlq(dst, allones, 1, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpsrld(dst, allones, 1, vlen_enc);
  }
}

// Materializes the minimum signed int/long value in every lane of dst by
// shifting an all ones vector left until only the sign bit remains.
// When compute_allones is true the all ones vector is first generated into
// allones, otherwise allones is expected to hold it already.
void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsllq(dst, allones, 63, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpslld(dst, allones, 31, vlen_enc);
  }
}

// Emits an unsigned compare into kmask using the instruction that matches
// the lane width of elem_bt.
void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask,  XMMRegister src1, XMMRegister src2,
                                Assembler::ComparisonPredicate cond, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
    case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
    case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
    case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

// Emits a signed greater-than compare into dst using the instruction that
// matches the lane width of elem_bt.
void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case  T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
    case  T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
    case  T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
    case  T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

// Moves the lane sign bits of src into ktmp, forwarding to the long or int
// flavour of the (possibly emulated) vector to mask move.
void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
  if (elem_bt == T_LONG) {
    evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  } else {
    assert(elem_bt == T_INT, "");
    evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  }
}

// Emits a signed saturating add or subtract for int/long lanes on EVEX
// targets. xtmp1, xtmp2, ktmp1 and ktmp2 are clobbered.
void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection based on Hacker's delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after above call when the emulation (non AVX512DQ)
  // path is taken; with AVX512DQ xtmp1 is left untouched, but it is also not read by the next call.

  // Compute mask based on first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  // xtmp2 = MAX_VALUE (all ones is regenerated into xtmp1 here), xtmp1 = MIN_VALUE.
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in first input polarity mask holds a min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}


// Emits a signed saturating add or subtract for int/long lanes on AVX
// targets. xtmp1 to xtmp4 are clobbered.
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection based on Hacker's delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  // xtmp1 = all ones, then xtmp2 = MAX_VALUE and xtmp1 = MIN_VALUE.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}

// Emits a signed saturating add or subtract for byte and short lanes
// (memory operand form). ideal_opc must be Op_SaturatingAddV or
// Op_SaturatingSubV; other element types are a fatal error.
void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// Emits an unsigned saturating add or subtract for byte and short lanes
// (memory operand form). ideal_opc must be Op_SaturatingAddV or
// Op_SaturatingSubV; other element types are a fatal error.
void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// Emits the two source permute (evpermi2*) that matches the element type.
// Any other type is a fatal error.
void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                     XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      evpermi2b(dst, src1, src2, vlen_enc);
      break;
    case T_SHORT:
      evpermi2w(dst, src1, src2, vlen_enc);
      break;
    case T_INT:
      evpermi2d(dst, src1, src2, vlen_enc);
      break;
    case T_LONG:
      evpermi2q(dst, src1, src2, vlen_enc);
      break;
    case T_FLOAT:
      evpermi2ps(dst, src1, src2, vlen_enc);
      break;
    case T_DOUBLE:
      evpermi2pd(dst, src1, src2, vlen_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// Selects between the unsigned and signed saturating emitters
// (register operand form).
void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
  if (is_unsigned) {
    vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  } else {
    vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  }
}

// Selects between the unsigned and signed saturating emitters
// (memory operand form).
void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
  if (is_unsigned) {
    vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  } else {
    vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  }
}

// Emits the packed half precision arithmetic instruction selected by the
// ideal opcode (register operand form). Any other opcode trips the assert
// in debug builds.
void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(opcode) {
    case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
    case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
    case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
    case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

// Emits the packed half precision arithmetic instruction selected by the
// ideal opcode (memory operand form). Any other opcode trips the assert
// in debug builds.
void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(opcode) {
    case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
    case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
    case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
    case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

// Scalar half precision min/max: reuses the vector implementation with a
// 128 bit vector length.
void C2_MacroAssembler::sminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                     KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vminmax_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}

// Scalar half precision min/max using the AVX10.2 evminmaxsh instruction.
// opcode must be Op_MaxHF or Op_MinHF.
void C2_MacroAssembler::sminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                             KRegister ktmp) {
  if (opcode == Op_MaxHF) {
    // dst = max(src1, src2)
    evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN);
  } else {
    assert(opcode == Op_MinHF, "");
    // dst = min(src1, src2)
    evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN);
  }
}

void C2_MacroAssembler::vminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                     KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make seconds source operand a +ve value.
    // As per instruction semantic, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if its a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
7115 Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc); 7116 } else { 7117 assert(opcode == Op_MinVHF || opcode == Op_MinHF, ""); 7118 // Move sign bits of src1 to mask register. 7119 evpmovw2m(ktmp, src1, vlen_enc); 7120 // xtmp1 = src1 < 0 ? src2 : src1 7121 evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc); 7122 // xtmp2 = src1 < 0 ? src1 : src2 7123 evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc); 7124 // Idea behind above swapping is to make seconds source operand a -ve value. 7125 // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in 7126 // the second source operand is returned. 7127 // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN 7128 // or a valid floating-point value, is written to the result. 7129 // dst = min(xtmp1, xtmp2) 7130 evminph(dst, xtmp1, xtmp2, vlen_enc); 7131 // isNaN = is_unordered_quiet(xtmp1) 7132 evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc); 7133 // Final result is same as first source if its a NaN value, 7134 // in case second operand holds a NaN value then as per above semantics 7135 // result is same as second operand. 
7136 Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc); 7137 } 7138 } 7139 7140 void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, 7141 KRegister ktmp, int vlen_enc) { 7142 if (opcode == Op_MaxVHF) { 7143 // dst = max(src1, src2) 7144 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc); 7145 } else { 7146 assert(opcode == Op_MinVHF, ""); 7147 // dst = min(src1, src2) 7148 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc); 7149 } 7150 } 7151 7152 void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, Address src2, 7153 KRegister ktmp, int vlen_enc) { 7154 if (opcode == Op_MaxVHF) { 7155 // dst = max(src1, src2) 7156 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc); 7157 } else { 7158 assert(opcode == Op_MinVHF, ""); 7159 // dst = min(src1, src2) 7160 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc); 7161 } 7162 } 7163 7164 int C2_MacroAssembler::vector_iota_entry_index(BasicType bt) { 7165 // The vector iota entries array is ordered by type B/S/I/L/F/D, and 7166 // the offset between two types is 16. 7167 switch(bt) { 7168 case T_BYTE: 7169 return 0; 7170 case T_SHORT: 7171 return 1; 7172 case T_INT: 7173 return 2; 7174 case T_LONG: 7175 return 3; 7176 case T_FLOAT: 7177 return 4; 7178 case T_DOUBLE: 7179 return 5; 7180 default: 7181 ShouldNotReachHere(); 7182 } 7183 } --- EOF ---