1 /* 2 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "asm/assembler.hpp" 26 #include "asm/assembler.inline.hpp" 27 #include "gc/shared/barrierSet.hpp" 28 #include "gc/shared/barrierSetAssembler.hpp" 29 #include "oops/methodData.hpp" 30 #include "opto/c2_MacroAssembler.hpp" 31 #include "opto/intrinsicnode.hpp" 32 #include "opto/output.hpp" 33 #include "opto/opcodes.hpp" 34 #include "opto/subnode.hpp" 35 #include "runtime/globals.hpp" 36 #include "runtime/objectMonitor.hpp" 37 #include "runtime/stubRoutines.hpp" 38 #include "utilities/checkedCast.hpp" 39 #include "utilities/globalDefinitions.hpp" 40 #include "utilities/powerOfTwo.hpp" 41 #include "utilities/sizes.hpp" 42 43 #ifdef PRODUCT 44 #define BLOCK_COMMENT(str) /* nothing */ 45 #define STOP(error) stop(error) 46 #else 47 #define BLOCK_COMMENT(str) block_comment(str) 48 #define STOP(error) block_comment(error); stop(error) 49 #endif 50 51 // C2 compiled method's prolog code. 52 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) { 53 if (C->clinit_barrier_on_entry()) { 54 assert(VM_Version::supports_fast_class_init_checks(), "sanity"); 55 assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started"); 56 57 Label L_skip_barrier; 58 Register klass = rscratch1; 59 60 mov_metadata(klass, C->method()->holder()->constant_encoding()); 61 clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/); 62 63 jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 64 65 bind(L_skip_barrier); 66 } 67 68 int framesize = C->output()->frame_size_in_bytes(); 69 int bangsize = C->output()->bang_size_in_bytes(); 70 bool fp_mode_24b = false; 71 int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0; 72 73 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 74 75 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 76 // Remove word for return addr 77 framesize -= wordSize; 78 stack_bang_size -= wordSize; 79 80 // Calls to C2R adapters often do not accept exceptional returns. 81 // We require that their callers must bang for them. But be careful, because 82 // some VM calls (such as call site linkage) can use several kilobytes of 83 // stack. But the stack safety zone should account for that. 84 // See bugs 4446381, 4468289, 4497237. 
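  //
  // Illustrative sketch (not emitted code) of the frame built below, assuming
  // the stack-bang path is taken.  frame_size_in_bytes() counts the return
  // address and the saved rbp, which is why one word is peeled off for each
  // before the final subptr(rsp, framesize):
  //
  //    [ return address ]   <- pushed by the caller's call
  //    [ saved rbp      ]   <- push(rbp)
  //    [ frame body     ]   <- subptr(rsp, framesize), framesize already
  //                            reduced by the two words above
  //    [ ...            ]   <- rsp after the prolog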
85 if (stack_bang_size > 0) { 86 generate_stack_overflow_check(stack_bang_size); 87 88 // We always push rbp, so that on return to interpreter rbp, will be 89 // restored correctly and we can correct the stack. 90 push(rbp); 91 // Save caller's stack pointer into RBP if the frame pointer is preserved. 92 if (PreserveFramePointer) { 93 mov(rbp, rsp); 94 } 95 // Remove word for ebp 96 framesize -= wordSize; 97 98 // Create frame 99 if (framesize) { 100 subptr(rsp, framesize); 101 } 102 } else { 103 subptr(rsp, framesize); 104 105 // Save RBP register now. 106 framesize -= wordSize; 107 movptr(Address(rsp, framesize), rbp); 108 // Save caller's stack pointer into RBP if the frame pointer is preserved. 109 if (PreserveFramePointer) { 110 movptr(rbp, rsp); 111 if (framesize > 0) { 112 addptr(rbp, framesize); 113 } 114 } 115 } 116 117 if (C->needs_stack_repair()) { 118 // Save stack increment just below the saved rbp (also account for fixed framesize and rbp) 119 assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned"); 120 movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize); 121 } 122 123 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 124 framesize -= wordSize; 125 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 126 } 127 128 #ifdef ASSERT 129 if (VerifyStackAtCalls) { 130 Label L; 131 push(rax); 132 mov(rax, rsp); 133 andptr(rax, StackAlignmentInBytes-1); 134 cmpptr(rax, StackAlignmentInBytes-wordSize); 135 pop(rax); 136 jcc(Assembler::equal, L); 137 STOP("Stack is not properly aligned!"); 138 bind(L); 139 } 140 #endif 141 } 142 143 void C2_MacroAssembler::entry_barrier() { 144 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 145 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub. 146 Label dummy_slow_path; 147 Label dummy_continuation; 148 Label* slow_path = &dummy_slow_path; 149 Label* continuation = &dummy_continuation; 150 if (!Compile::current()->output()->in_scratch_emit_size()) { 151 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 152 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 153 Compile::current()->output()->add_stub(stub); 154 slow_path = &stub->entry(); 155 continuation = &stub->continuation(); 156 } 157 bs->nmethod_entry_barrier(this, slow_path, continuation); 158 } 159 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 161 switch (vlen_in_bytes) { 162 case 4: // fall-through 163 case 8: // fall-through 164 case 16: return Assembler::AVX_128bit; 165 case 32: return Assembler::AVX_256bit; 166 case 64: return Assembler::AVX_512bit; 167 168 default: { 169 ShouldNotReachHere(); 170 return Assembler::AVX_NoVec; 171 } 172 } 173 } 174 175 // fast_lock and fast_unlock used by C2 176 177 // Because the transitions from emitted code to the runtime 178 // monitorenter/exit helper stubs are so slow it's critical that 179 // we inline both the stack-locking fast path and the inflated fast path. 180 // 181 // See also: cmpFastLock and cmpFastUnlock. 182 // 183 // What follows is a specialized inline transliteration of the code 184 // in enter() and exit(). If we're concerned about I$ bloat another 185 // option would be to emit TrySlowEnter and TrySlowExit methods 186 // at startup-time. 
These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//      fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//      fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path.  If the fast path fails then we pass
//   control to the slow path, typically in C.  In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock.  In the case of success, control
//   will drop through the node.  ICC.ZF is undefined at exit.
243 // In the case of failure, the node will branch directly to the 244 // FailureLabel 245 246 247 // obj: object to lock 248 // box: on-stack box address -- KILLED 249 // rax: tmp -- KILLED 250 // t : tmp -- KILLED 251 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg, 252 Register t, Register thread) { 253 assert(rax_reg == rax, "Used for CAS"); 254 assert_different_registers(obj, box, rax_reg, t, thread); 255 256 // Handle inflated monitor. 257 Label inflated; 258 // Finish fast lock successfully. ZF value is irrelevant. 259 Label locked; 260 // Finish fast lock unsuccessfully. MUST jump with ZF == 0 261 Label slow_path; 262 263 if (UseObjectMonitorTable) { 264 // Clear cache in case fast locking succeeds or we need to take the slow-path. 265 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0); 266 } 267 268 if (DiagnoseSyncOnValueBasedClasses != 0) { 269 load_klass(rax_reg, obj, t); 270 testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 271 jcc(Assembler::notZero, slow_path); 272 } 273 274 const Register mark = t; 275 276 { // Lightweight Lock 277 278 Label push; 279 280 const Register top = UseObjectMonitorTable ? rax_reg : box; 281 282 // Load the mark. 283 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 284 285 // Prefetch top. 286 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 287 288 // Check for monitor (0b10). 289 testptr(mark, markWord::monitor_value); 290 jcc(Assembler::notZero, inflated); 291 292 // Check if lock-stack is full. 293 cmpl(top, LockStack::end_offset() - 1); 294 jcc(Assembler::greater, slow_path); 295 296 // Check if recursive. 297 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 298 jccb(Assembler::equal, push); 299 300 // Try to lock. Transition lock bits 0b01 => 0b00 301 movptr(rax_reg, mark); 302 orptr(rax_reg, markWord::unlocked_value); 303 andptr(mark, ~(int32_t)markWord::unlocked_value); 304 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 305 jcc(Assembler::notEqual, slow_path); 306 307 if (UseObjectMonitorTable) { 308 // Need to reload top, clobbered by CAS. 309 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 310 } 311 bind(push); 312 // After successful lock, push object on lock-stack. 313 movptr(Address(thread, top), obj); 314 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 315 jmpb(locked); 316 } 317 318 { // Handle inflated monitor. 319 bind(inflated); 320 321 const Register monitor = t; 322 323 if (!UseObjectMonitorTable) { 324 assert(mark == monitor, "should be the same here"); 325 } else { 326 // Uses ObjectMonitorTable. Look for the monitor in the om_cache. 327 // Fetch ObjectMonitor* from the cache or take the slow-path. 328 Label monitor_found; 329 330 // Load cache address 331 lea(t, Address(thread, JavaThread::om_cache_oops_offset())); 332 333 const int num_unrolled = 2; 334 for (int i = 0; i < num_unrolled; i++) { 335 cmpptr(obj, Address(t)); 336 jccb(Assembler::equal, monitor_found); 337 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 338 } 339 340 Label loop; 341 342 // Search for obj in cache. 343 bind(loop); 344 345 // Check for match. 346 cmpptr(obj, Address(t)); 347 jccb(Assembler::equal, monitor_found); 348 349 // Search until null encountered, guaranteed _null_sentinel at end. 
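      // The compare against the immediate 1 doubles as the null check: a null
      // slot (0) is strictly below 1 unsigned, so CF is set and ZF is clear and
      // the 'below' branch enters slow_path with ZF == 0, as slow_path requires.
      // Any real oop is aligned and therefore greater than 1, so it falls through.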
350 cmpptr(Address(t), 1); 351 jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0 352 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 353 jmpb(loop); 354 355 // Cache hit. 356 bind(monitor_found); 357 movptr(monitor, Address(t, OMCache::oop_to_monitor_difference())); 358 } 359 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 360 const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag); 361 const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag); 362 363 Label monitor_locked; 364 // Lock the monitor. 365 366 if (UseObjectMonitorTable) { 367 // Cache the monitor for unlock before trashing box. On failure to acquire 368 // the lock, the slow path will reset the entry accordingly (see CacheSetter). 369 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor); 370 } 371 372 // Try to CAS owner (no owner => current thread's _monitor_owner_id). 373 xorptr(rax_reg, rax_reg); 374 movptr(box, Address(thread, JavaThread::monitor_owner_id_offset())); 375 lock(); cmpxchgptr(box, owner_address); 376 jccb(Assembler::equal, monitor_locked); 377 378 // Check if recursive. 379 cmpptr(box, rax_reg); 380 jccb(Assembler::notEqual, slow_path); 381 382 // Recursive. 383 increment(recursions_address); 384 385 bind(monitor_locked); 386 } 387 388 bind(locked); 389 // Set ZF = 1 390 xorl(rax_reg, rax_reg); 391 392 #ifdef ASSERT 393 // Check that locked label is reached with ZF set. 394 Label zf_correct; 395 Label zf_bad_zero; 396 jcc(Assembler::zero, zf_correct); 397 jmp(zf_bad_zero); 398 #endif 399 400 bind(slow_path); 401 #ifdef ASSERT 402 // Check that slow_path label is reached with ZF not set. 403 jcc(Assembler::notZero, zf_correct); 404 stop("Fast Lock ZF != 0"); 405 bind(zf_bad_zero); 406 stop("Fast Lock ZF != 1"); 407 bind(zf_correct); 408 #endif 409 // C2 uses the value of ZF to determine the continuation. 410 } 411 412 // obj: object to lock 413 // rax: tmp -- KILLED 414 // t : tmp - cannot be obj nor rax -- KILLED 415 // 416 // Some commentary on balanced locking: 417 // 418 // fast_lock and fast_unlock are emitted only for provably balanced lock sites. 419 // Methods that don't have provably balanced locking are forced to run in the 420 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock. 421 // The interpreter provides two properties: 422 // I1: At return-time the interpreter automatically and quietly unlocks any 423 // objects acquired in the current activation (frame). Recall that the 424 // interpreter maintains an on-stack list of locks currently held by 425 // a frame. 426 // I2: If a method attempts to unlock an object that is not held by the 427 // frame the interpreter throws IMSX. 428 // 429 // Lets say A(), which has provably balanced locking, acquires O and then calls B(). 430 // B() doesn't have provably balanced locking so it runs in the interpreter. 431 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O 432 // is still locked by A(). 433 // 434 // The only other source of unbalanced locking would be JNI. The "Java Native Interface 435 // Specification" states that an object locked by JNI's MonitorEnter should not be 436 // unlocked by "normal" java-level locking and vice-versa. The specification doesn't 437 // specify what will occur if a program engages in such mixed-mode locking, however. 
// Arguably, given that the spec legislates the JNI case as undefined, our
// implementation could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked, slow_path;

  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockLightweightStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Lightweight Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }

  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
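      // fast_lock_lightweight cached the ObjectMonitor* in the BasicLock box
      // right before acquiring the monitor, so the unlock path can read it back
      // here instead of probing the OMCache again.  An empty cache slot holds a
      // value below alignof(ObjectMonitor*) and sends us to the slow path.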
533 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 534 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*) 535 cmpptr(monitor, alignof(ObjectMonitor*)); 536 jcc(Assembler::below, slow_path); 537 } 538 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 539 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag}; 540 const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag}; 541 const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag}; 542 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag}; 543 544 Label recursive; 545 546 // Check if recursive. 547 cmpptr(recursions_address, 0); 548 jccb(Assembler::notZero, recursive); 549 550 // Set owner to null. 551 // Release to satisfy the JMM 552 movptr(owner_address, NULL_WORD); 553 // We need a full fence after clearing owner to avoid stranding. 554 // StoreLoad achieves this. 555 membar(StoreLoad); 556 557 // Check if the entry_list is empty. 558 cmpptr(entry_list_address, NULL_WORD); 559 jccb(Assembler::zero, unlocked); // If so we are done. 560 561 // Check if there is a successor. 562 cmpptr(succ_address, NULL_WORD); 563 jccb(Assembler::notZero, unlocked); // If so we are done. 564 565 // Save the monitor pointer in the current thread, so we can try to 566 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 567 if (!UseObjectMonitorTable) { 568 andptr(monitor, ~(int32_t)markWord::monitor_value); 569 } 570 movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor); 571 572 orl(t, 1); // Fast Unlock ZF = 0 573 jmpb(slow_path); 574 575 // Recursive unlock. 576 bind(recursive); 577 decrement(recursions_address); 578 } 579 580 bind(unlocked); 581 xorl(t, t); // Fast Unlock ZF = 1 582 583 #ifdef ASSERT 584 // Check that unlocked label is reached with ZF set. 585 Label zf_correct; 586 Label zf_bad_zero; 587 jcc(Assembler::zero, zf_correct); 588 jmp(zf_bad_zero); 589 #endif 590 591 bind(slow_path); 592 if (stub != nullptr) { 593 bind(stub->slow_path_continuation()); 594 } 595 #ifdef ASSERT 596 // Check that stub->continuation() label is reached with ZF not set. 597 jcc(Assembler::notZero, zf_correct); 598 stop("Fast Unlock ZF != 0"); 599 bind(zf_bad_zero); 600 stop("Fast Unlock ZF != 1"); 601 bind(zf_correct); 602 #endif 603 // C2 uses the value of ZF to determine the continuation. 604 } 605 606 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) { 607 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi); 608 } 609 610 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) { 611 const int framesize = Compile::current()->output()->frame_size_in_bytes(); 612 masm->movptr(dst, rsp); 613 if (framesize > 2 * wordSize) { 614 masm->addptr(dst, framesize - 2 * wordSize); 615 } 616 } 617 618 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) { 619 if (PreserveFramePointer) { 620 // frame pointer is valid 621 #ifdef ASSERT 622 // Verify frame pointer value in rbp. 
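    // With PreserveFramePointer the prolog set rbp to rsp just after pushing
    // the old rbp, i.e. to rsp + frame_size_in_bytes() - 2 * wordSize.  The
    // helper recomputes that value from the current rsp, so the assert below
    // catches any drift between the two.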
623 reconstruct_frame_pointer_helper(this, rtmp); 624 Label L_success; 625 cmpq(rbp, rtmp); 626 jccb(Assembler::equal, L_success); 627 STOP("frame pointer mismatch"); 628 bind(L_success); 629 #endif // ASSERT 630 } else { 631 reconstruct_frame_pointer_helper(this, rbp); 632 } 633 } 634 635 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) { 636 jint lo = t->_lo; 637 jint hi = t->_hi; 638 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi); 639 if (t == TypeInt::INT) { 640 return; 641 } 642 643 BLOCK_COMMENT("CastII {"); 644 Label fail; 645 Label succeed; 646 if (hi == max_jint) { 647 cmpl(val, lo); 648 jccb(Assembler::greaterEqual, succeed); 649 } else { 650 if (lo != min_jint) { 651 cmpl(val, lo); 652 jccb(Assembler::less, fail); 653 } 654 cmpl(val, hi); 655 jccb(Assembler::lessEqual, succeed); 656 } 657 658 bind(fail); 659 movl(c_rarg0, idx); 660 movl(c_rarg1, val); 661 movl(c_rarg2, lo); 662 movl(c_rarg3, hi); 663 reconstruct_frame_pointer(rscratch1); 664 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range))); 665 hlt(); 666 bind(succeed); 667 BLOCK_COMMENT("} // CastII"); 668 } 669 670 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) { 671 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi); 672 } 673 674 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) { 675 jlong lo = t->_lo; 676 jlong hi = t->_hi; 677 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi); 678 if (t == TypeLong::LONG) { 679 return; 680 } 681 682 BLOCK_COMMENT("CastLL {"); 683 Label fail; 684 Label succeed; 685 686 auto cmp_val = [&](jlong bound) { 687 if (is_simm32(bound)) { 688 cmpq(val, checked_cast<int>(bound)); 689 } else { 690 mov64(tmp, bound); 691 cmpq(val, tmp); 692 } 693 }; 694 695 if (hi == max_jlong) { 696 cmp_val(lo); 697 jccb(Assembler::greaterEqual, succeed); 698 } else { 699 if (lo != min_jlong) { 700 cmp_val(lo); 701 jccb(Assembler::less, fail); 702 } 703 cmp_val(hi); 704 jccb(Assembler::lessEqual, succeed); 705 } 706 707 bind(fail); 708 movl(c_rarg0, idx); 709 movq(c_rarg1, val); 710 mov64(c_rarg2, lo); 711 mov64(c_rarg3, hi); 712 reconstruct_frame_pointer(rscratch1); 713 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range))); 714 hlt(); 715 bind(succeed); 716 BLOCK_COMMENT("} // CastLL"); 717 } 718 719 //------------------------------------------------------------------------------------------- 720 // Generic instructions support for use in .ad files C2 code generation 721 722 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 723 if (dst != src) { 724 movdqu(dst, src); 725 } 726 if (opcode == Op_AbsVD) { 727 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 728 } else { 729 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 730 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 731 } 732 } 733 734 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 735 if (opcode == Op_AbsVD) { 736 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 737 } else { 738 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 739 vxorpd(dst, src, 
ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 740 } 741 } 742 743 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 744 if (dst != src) { 745 movdqu(dst, src); 746 } 747 if (opcode == Op_AbsVF) { 748 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 749 } else { 750 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 751 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 752 } 753 } 754 755 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 756 if (opcode == Op_AbsVF) { 757 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 758 } else { 759 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 760 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 761 } 762 } 763 764 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 765 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 766 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 767 768 if (opcode == Op_MinV) { 769 if (elem_bt == T_BYTE) { 770 pminsb(dst, src); 771 } else if (elem_bt == T_SHORT) { 772 pminsw(dst, src); 773 } else if (elem_bt == T_INT) { 774 pminsd(dst, src); 775 } else { 776 assert(elem_bt == T_LONG, "required"); 777 assert(tmp == xmm0, "required"); 778 assert_different_registers(dst, src, tmp); 779 movdqu(xmm0, dst); 780 pcmpgtq(xmm0, src); 781 blendvpd(dst, src); // xmm0 as mask 782 } 783 } else { // opcode == Op_MaxV 784 if (elem_bt == T_BYTE) { 785 pmaxsb(dst, src); 786 } else if (elem_bt == T_SHORT) { 787 pmaxsw(dst, src); 788 } else if (elem_bt == T_INT) { 789 pmaxsd(dst, src); 790 } else { 791 assert(elem_bt == T_LONG, "required"); 792 assert(tmp == xmm0, "required"); 793 assert_different_registers(dst, src, tmp); 794 movdqu(xmm0, src); 795 pcmpgtq(xmm0, dst); 796 blendvpd(dst, src); // xmm0 as mask 797 } 798 } 799 } 800 801 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 802 XMMRegister src1, Address src2, int vlen_enc) { 803 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 804 if (opcode == Op_UMinV) { 805 switch(elem_bt) { 806 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 807 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 808 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 809 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 810 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 811 } 812 } else { 813 assert(opcode == Op_UMaxV, "required"); 814 switch(elem_bt) { 815 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 816 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 817 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 818 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 819 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 820 } 821 } 822 } 823 824 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 825 // For optimality, leverage a full vector width of 512 bits 826 // for operations over smaller vector sizes on AVX512 targets. 
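  // The else path below emulates the unsigned compare by biasing both inputs
  // with 2^63 (vpaddq with -1 << 63, which only flips the sign bit) so that
  // the signed vpcmpgtq orders them the same way an unsigned compare would.
  // A scalar sketch of the idea (illustrative only, not emitted code):
  //
  //   uint64_t umax64(uint64_t a, uint64_t b) {
  //     int64_t sa = (int64_t)(a ^ (1ULL << 63));  // bias: flip the sign bit
  //     int64_t sb = (int64_t)(b ^ (1ULL << 63));
  //     return (sb > sa) ? b : a;                  // signed compare on biased values
  //   }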
827 if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) { 828 if (opcode == Op_UMaxV) { 829 evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 830 } else { 831 assert(opcode == Op_UMinV, "required"); 832 evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 833 } 834 } else { 835 // T1 = -1 836 vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc); 837 // T1 = -1 << 63 838 vpsllq(xtmp1, xtmp1, 63, vlen_enc); 839 // Convert SRC2 to signed value i.e. T2 = T1 + SRC2 840 vpaddq(xtmp2, xtmp1, src2, vlen_enc); 841 // Convert SRC1 to signed value i.e. T1 = T1 + SRC1 842 vpaddq(xtmp1, xtmp1, src1, vlen_enc); 843 // Mask = T2 > T1 844 vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc); 845 if (opcode == Op_UMaxV) { 846 // Res = Mask ? Src2 : Src1 847 vpblendvb(dst, src1, src2, xtmp1, vlen_enc); 848 } else { 849 // Res = Mask ? Src1 : Src2 850 vpblendvb(dst, src2, src1, xtmp1, vlen_enc); 851 } 852 } 853 } 854 855 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 856 XMMRegister src1, XMMRegister src2, int vlen_enc) { 857 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 858 if (opcode == Op_UMinV) { 859 switch(elem_bt) { 860 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 861 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 862 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 863 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 864 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 865 } 866 } else { 867 assert(opcode == Op_UMaxV, "required"); 868 switch(elem_bt) { 869 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 870 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 871 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 872 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 873 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 874 } 875 } 876 } 877 878 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 879 XMMRegister dst, XMMRegister src1, XMMRegister src2, 880 int vlen_enc) { 881 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 882 883 if (opcode == Op_MinV) { 884 if (elem_bt == T_BYTE) { 885 vpminsb(dst, src1, src2, vlen_enc); 886 } else if (elem_bt == T_SHORT) { 887 vpminsw(dst, src1, src2, vlen_enc); 888 } else if (elem_bt == T_INT) { 889 vpminsd(dst, src1, src2, vlen_enc); 890 } else { 891 assert(elem_bt == T_LONG, "required"); 892 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 893 vpminsq(dst, src1, src2, vlen_enc); 894 } else { 895 assert_different_registers(dst, src1, src2); 896 vpcmpgtq(dst, src1, src2, vlen_enc); 897 vblendvpd(dst, src1, src2, dst, vlen_enc); 898 } 899 } 900 } else { // opcode == Op_MaxV 901 if (elem_bt == T_BYTE) { 902 vpmaxsb(dst, src1, src2, vlen_enc); 903 } else if (elem_bt == T_SHORT) { 904 vpmaxsw(dst, src1, src2, vlen_enc); 905 } else if (elem_bt == T_INT) { 906 vpmaxsd(dst, src1, src2, vlen_enc); 907 } else { 908 assert(elem_bt == T_LONG, "required"); 909 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 910 vpmaxsq(dst, src1, src2, vlen_enc); 911 } else { 912 assert_different_registers(dst, src1, src2); 913 vpcmpgtq(dst, src1, src2, vlen_enc); 914 vblendvpd(dst, src2, src1, dst, vlen_enc); 915 } 916 } 917 } 918 } 919 920 // Float/Double min max 921 922 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, 923 XMMRegister dst, XMMRegister a, XMMRegister b, 924 XMMRegister tmp, 
XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * The following pseudo code describes the algorithm for max[FD] (the min algorithm is along similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp, btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst,
XMMRegister a, XMMRegister b, 1012 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1013 int vlen_enc) { 1014 assert(UseAVX > 2, "required"); 1015 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1016 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1017 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1018 assert_different_registers(dst, a, atmp, btmp); 1019 assert_different_registers(dst, b, atmp, btmp); 1020 1021 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1022 bool is_double_word = is_double_word_type(elem_bt); 1023 bool merge = true; 1024 1025 if (!is_double_word && is_min) { 1026 evpmovd2m(ktmp, a, vlen_enc); 1027 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1028 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1029 vminps(dst, atmp, btmp, vlen_enc); 1030 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1031 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1032 } else if (!is_double_word && !is_min) { 1033 evpmovd2m(ktmp, b, vlen_enc); 1034 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1035 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1036 vmaxps(dst, atmp, btmp, vlen_enc); 1037 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1038 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1039 } else if (is_double_word && is_min) { 1040 evpmovq2m(ktmp, a, vlen_enc); 1041 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1042 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1043 vminpd(dst, atmp, btmp, vlen_enc); 1044 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1045 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1046 } else { 1047 assert(is_double_word && !is_min, "sanity"); 1048 evpmovq2m(ktmp, b, vlen_enc); 1049 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1050 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1051 vmaxpd(dst, atmp, btmp, vlen_enc); 1052 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1053 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1054 } 1055 } 1056 1057 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask, 1058 XMMRegister src1, XMMRegister src2, int vlen_enc) { 1059 assert(opc == Op_MinV || opc == Op_MinReductionV || 1060 opc == Op_MaxV || opc == Op_MaxReductionV, "sanity"); 1061 1062 int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? 
AVX10_MINMAX_MIN_COMPARE_SIGN 1063 : AVX10_MINMAX_MAX_COMPARE_SIGN; 1064 if (elem_bt == T_FLOAT) { 1065 evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc); 1066 } else { 1067 assert(elem_bt == T_DOUBLE, ""); 1068 evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc); 1069 } 1070 } 1071 1072 // Float/Double signum 1073 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1074 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1075 1076 Label DONE_LABEL; 1077 1078 if (opcode == Op_SignumF) { 1079 ucomiss(dst, zero); 1080 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1081 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1082 movflt(dst, one); 1083 jcc(Assembler::above, DONE_LABEL); 1084 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1085 } else if (opcode == Op_SignumD) { 1086 ucomisd(dst, zero); 1087 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1088 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1089 movdbl(dst, one); 1090 jcc(Assembler::above, DONE_LABEL); 1091 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1092 } 1093 1094 bind(DONE_LABEL); 1095 } 1096 1097 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1098 if (sign) { 1099 pmovsxbw(dst, src); 1100 } else { 1101 pmovzxbw(dst, src); 1102 } 1103 } 1104 1105 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1106 if (sign) { 1107 vpmovsxbw(dst, src, vector_len); 1108 } else { 1109 vpmovzxbw(dst, src, vector_len); 1110 } 1111 } 1112 1113 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1114 if (sign) { 1115 vpmovsxbd(dst, src, vector_len); 1116 } else { 1117 vpmovzxbd(dst, src, vector_len); 1118 } 1119 } 1120 1121 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1122 if (sign) { 1123 vpmovsxwd(dst, src, vector_len); 1124 } else { 1125 vpmovzxwd(dst, src, vector_len); 1126 } 1127 } 1128 1129 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1130 int shift, int vector_len) { 1131 if (opcode == Op_RotateLeftV) { 1132 if (etype == T_INT) { 1133 evprold(dst, src, shift, vector_len); 1134 } else { 1135 assert(etype == T_LONG, "expected type T_LONG"); 1136 evprolq(dst, src, shift, vector_len); 1137 } 1138 } else { 1139 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1140 if (etype == T_INT) { 1141 evprord(dst, src, shift, vector_len); 1142 } else { 1143 assert(etype == T_LONG, "expected type T_LONG"); 1144 evprorq(dst, src, shift, vector_len); 1145 } 1146 } 1147 } 1148 1149 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1150 XMMRegister shift, int vector_len) { 1151 if (opcode == Op_RotateLeftV) { 1152 if (etype == T_INT) { 1153 evprolvd(dst, src, shift, vector_len); 1154 } else { 1155 assert(etype == T_LONG, "expected type T_LONG"); 1156 evprolvq(dst, src, shift, vector_len); 1157 } 1158 } else { 1159 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1160 if (etype == T_INT) { 1161 evprorvd(dst, src, shift, vector_len); 1162 } else { 1163 assert(etype == T_LONG, "expected type 
T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case
Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1265 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1266 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1267 1268 default: assert(false, "%s", NodeClassNames[opcode]); 1269 } 1270 } 1271 1272 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1273 if (opcode == Op_RShiftVL) { 1274 evpsraq(dst, nds, shift, vector_len); 1275 } else if (opcode == Op_LShiftVL) { 1276 vpsllq(dst, nds, shift, vector_len); 1277 } else { 1278 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1279 vpsrlq(dst, nds, shift, vector_len); 1280 } 1281 } 1282 1283 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1284 switch (opcode) { 1285 case Op_RShiftVB: // fall-through 1286 case Op_RShiftVS: // fall-through 1287 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1288 1289 case Op_LShiftVB: // fall-through 1290 case Op_LShiftVS: // fall-through 1291 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1292 1293 case Op_URShiftVB: // fall-through 1294 case Op_URShiftVS: // fall-through 1295 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1296 1297 default: assert(false, "%s", NodeClassNames[opcode]); 1298 } 1299 } 1300 1301 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1302 switch (opcode) { 1303 case Op_RShiftVB: // fall-through 1304 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1305 1306 case Op_LShiftVB: // fall-through 1307 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1308 1309 case Op_URShiftVB: // fall-through 1310 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1311 1312 default: assert(false, "%s", NodeClassNames[opcode]); 1313 } 1314 } 1315 1316 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1317 assert(UseAVX >= 2, "required"); 1318 switch (opcode) { 1319 case Op_RShiftVL: { 1320 if (UseAVX > 2) { 1321 assert(tmp == xnoreg, "not used"); 1322 if (!VM_Version::supports_avx512vl()) { 1323 vlen_enc = Assembler::AVX_512bit; 1324 } 1325 evpsravq(dst, src, shift, vlen_enc); 1326 } else { 1327 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1328 vpsrlvq(dst, src, shift, vlen_enc); 1329 vpsrlvq(tmp, tmp, shift, vlen_enc); 1330 vpxor(dst, dst, tmp, vlen_enc); 1331 vpsubq(dst, dst, tmp, vlen_enc); 1332 } 1333 break; 1334 } 1335 case Op_LShiftVL: { 1336 assert(tmp == xnoreg, "not used"); 1337 vpsllvq(dst, src, shift, vlen_enc); 1338 break; 1339 } 1340 case Op_URShiftVL: { 1341 assert(tmp == xnoreg, "not used"); 1342 vpsrlvq(dst, src, shift, vlen_enc); 1343 break; 1344 } 1345 default: assert(false, "%s", NodeClassNames[opcode]); 1346 } 1347 } 1348 1349 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1350 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1351 assert(opcode == Op_LShiftVB || 1352 opcode == Op_RShiftVB || 1353 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1354 bool sign = (opcode != Op_URShiftVB); 1355 assert(vector_len == 0, "required"); 1356 vextendbd(sign, dst, src, 1); 1357 vpmovzxbd(vtmp, shift, 1); 1358 varshiftd(opcode, dst, dst, vtmp, 1); 1359 vpand(dst, dst, 
ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1360 vextracti128_high(vtmp, dst); 1361 vpackusdw(dst, dst, vtmp, 0); 1362 } 1363 1364 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1365 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1366 assert(opcode == Op_LShiftVB || 1367 opcode == Op_RShiftVB || 1368 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1369 bool sign = (opcode != Op_URShiftVB); 1370 int ext_vector_len = vector_len + 1; 1371 vextendbw(sign, dst, src, ext_vector_len); 1372 vpmovzxbw(vtmp, shift, ext_vector_len); 1373 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1374 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1375 if (vector_len == 0) { 1376 vextracti128_high(vtmp, dst); 1377 vpackuswb(dst, dst, vtmp, vector_len); 1378 } else { 1379 vextracti64x4_high(vtmp, dst); 1380 vpackuswb(dst, dst, vtmp, vector_len); 1381 vpermq(dst, dst, 0xD8, vector_len); 1382 } 1383 } 1384 1385 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1386 switch(typ) { 1387 case T_BYTE: 1388 pinsrb(dst, val, idx); 1389 break; 1390 case T_SHORT: 1391 pinsrw(dst, val, idx); 1392 break; 1393 case T_INT: 1394 pinsrd(dst, val, idx); 1395 break; 1396 case T_LONG: 1397 pinsrq(dst, val, idx); 1398 break; 1399 default: 1400 assert(false,"Should not reach here."); 1401 break; 1402 } 1403 } 1404 1405 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1406 switch(typ) { 1407 case T_BYTE: 1408 vpinsrb(dst, src, val, idx); 1409 break; 1410 case T_SHORT: 1411 vpinsrw(dst, src, val, idx); 1412 break; 1413 case T_INT: 1414 vpinsrd(dst, src, val, idx); 1415 break; 1416 case T_LONG: 1417 vpinsrq(dst, src, val, idx); 1418 break; 1419 default: 1420 assert(false,"Should not reach here."); 1421 break; 1422 } 1423 } 1424 1425 void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst, 1426 Register base, Register idx_base, 1427 Register mask, Register mask_idx, 1428 Register rtmp, int vlen_enc) { 1429 vpxor(dst, dst, dst, vlen_enc); 1430 if (elem_bt == T_SHORT) { 1431 for (int i = 0; i < 4; i++) { 1432 // dst[i] = mask[i] ? src[idx_base[i]] : 0 1433 Label skip_load; 1434 btq(mask, mask_idx); 1435 jccb(Assembler::carryClear, skip_load); 1436 movl(rtmp, Address(idx_base, i * 4)); 1437 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1438 bind(skip_load); 1439 incq(mask_idx); 1440 } 1441 } else { 1442 assert(elem_bt == T_BYTE, ""); 1443 for (int i = 0; i < 8; i++) { 1444 // dst[i] = mask[i] ? 
src[idx_base[i]] : 0 1445 Label skip_load; 1446 btq(mask, mask_idx); 1447 jccb(Assembler::carryClear, skip_load); 1448 movl(rtmp, Address(idx_base, i * 4)); 1449 pinsrb(dst, Address(base, rtmp), i); 1450 bind(skip_load); 1451 incq(mask_idx); 1452 } 1453 } 1454 } 1455 1456 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst, 1457 Register base, Register idx_base, 1458 Register rtmp, int vlen_enc) { 1459 vpxor(dst, dst, dst, vlen_enc); 1460 if (elem_bt == T_SHORT) { 1461 for (int i = 0; i < 4; i++) { 1462 // dst[i] = src[idx_base[i]] 1463 movl(rtmp, Address(idx_base, i * 4)); 1464 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1465 } 1466 } else { 1467 assert(elem_bt == T_BYTE, ""); 1468 for (int i = 0; i < 8; i++) { 1469 // dst[i] = src[idx_base[i]] 1470 movl(rtmp, Address(idx_base, i * 4)); 1471 pinsrb(dst, Address(base, rtmp), i); 1472 } 1473 } 1474 } 1475 1476 /* 1477 * Gather using hybrid algorithm, first partially unroll scalar loop 1478 * to accumulate values from gather indices into a quad-word(64bit) slice. 1479 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1480 * permutation to place the slice into appropriate vector lane 1481 * locations in destination vector. Following pseudo code describes the 1482 * algorithm in detail: 1483 * 1484 * DST_VEC = ZERO_VEC 1485 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1486 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1487 * FOREACH_ITER: 1488 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1489 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1490 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1491 * PERM_INDEX = PERM_INDEX - TWO_VEC 1492 * 1493 * With each iteration, doubleword permute indices (0,1) corresponding 1494 * to gathered quadword gets right shifted by two lane positions. 1495 * 1496 */ 1497 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1498 Register base, Register idx_base, 1499 Register mask, XMMRegister xtmp1, 1500 XMMRegister xtmp2, XMMRegister temp_dst, 1501 Register rtmp, Register mask_idx, 1502 Register length, int vector_len, int vlen_enc) { 1503 Label GATHER8_LOOP; 1504 assert(is_subword_type(elem_ty), ""); 1505 movl(length, vector_len); 1506 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1507 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1508 vallones(xtmp2, vlen_enc); 1509 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1510 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1511 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1512 1513 bind(GATHER8_LOOP); 1514 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1515 if (mask == noreg) { 1516 vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc); 1517 } else { 1518 vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc); 1519 } 1520 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1521 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? 
vlen_enc : Assembler::AVX_256bit); 1522 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1523 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1524 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1525 vpor(dst, dst, temp_dst, vlen_enc); 1526 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1527 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1528 jcc(Assembler::notEqual, GATHER8_LOOP); 1529 } 1530 1531 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1532 switch(typ) { 1533 case T_INT: 1534 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1535 break; 1536 case T_FLOAT: 1537 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1538 break; 1539 case T_LONG: 1540 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1541 break; 1542 case T_DOUBLE: 1543 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1544 break; 1545 default: 1546 assert(false,"Should not reach here."); 1547 break; 1548 } 1549 } 1550 1551 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1552 switch(typ) { 1553 case T_INT: 1554 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1555 break; 1556 case T_FLOAT: 1557 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1558 break; 1559 case T_LONG: 1560 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1561 break; 1562 case T_DOUBLE: 1563 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1564 break; 1565 default: 1566 assert(false,"Should not reach here."); 1567 break; 1568 } 1569 } 1570 1571 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1572 switch(typ) { 1573 case T_INT: 1574 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1575 break; 1576 case T_FLOAT: 1577 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1578 break; 1579 case T_LONG: 1580 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1581 break; 1582 case T_DOUBLE: 1583 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1584 break; 1585 default: 1586 assert(false,"Should not reach here."); 1587 break; 1588 } 1589 } 1590 1591 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1592 if (vlen_in_bytes <= 16) { 1593 pxor (dst, dst); 1594 psubb(dst, src); 1595 switch (elem_bt) { 1596 case T_BYTE: /* nothing to do */ break; 1597 case T_SHORT: pmovsxbw(dst, dst); break; 1598 case T_INT: pmovsxbd(dst, dst); break; 1599 case T_FLOAT: pmovsxbd(dst, dst); break; 1600 case T_LONG: pmovsxbq(dst, dst); break; 1601 case T_DOUBLE: pmovsxbq(dst, dst); break; 1602 1603 default: assert(false, "%s", type2name(elem_bt)); 1604 } 1605 } else { 1606 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1607 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1608 1609 vpxor (dst, dst, dst, vlen_enc); 1610 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1611 1612 switch (elem_bt) { 1613 case T_BYTE: /* nothing to do */ break; 1614 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1615 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1616 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1617 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1618 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1619 1620 default: assert(false, "%s", type2name(elem_bt)); 1621 } 1622 } 1623 } 1624 1625 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1626 if (novlbwdq) { 1627 vpmovsxbd(xtmp, src, vlen_enc); 1628 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1629 Assembler::eq, true, vlen_enc, noreg); 1630 } else { 1631 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1632 vpsubb(xtmp, xtmp, src, vlen_enc); 1633 evpmovb2m(dst, xtmp, vlen_enc); 1634 } 1635 } 1636 1637 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) { 1638 if (is_integral_type(bt)) { 1639 switch (vlen_in_bytes) { 1640 case 4: movdl(dst, src); break; 1641 case 8: movq(dst, src); break; 1642 case 16: movdqu(dst, src); break; 1643 case 32: vmovdqu(dst, src); break; 1644 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1645 default: ShouldNotReachHere(); 1646 } 1647 } else { 1648 switch (vlen_in_bytes) { 1649 case 4: movflt(dst, src); break; 1650 case 8: movdbl(dst, src); break; 1651 case 16: movups(dst, src); break; 1652 case 32: vmovups(dst, src, Assembler::AVX_256bit); break; 1653 case 64: vmovups(dst, src, Assembler::AVX_512bit); break; 1654 default: ShouldNotReachHere(); 1655 } 1656 } 1657 } 1658 1659 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1660 assert(rscratch != noreg || always_reachable(src), "missing"); 1661 1662 if (reachable(src)) { 1663 load_vector(bt, dst, as_Address(src), vlen_in_bytes); 1664 } else { 1665 lea(rscratch, src); 1666 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes); 1667 } 1668 } 1669 1670 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1671 int vlen_enc = vector_length_encoding(vlen); 1672 if (VM_Version::supports_avx()) { 1673 if (bt == T_LONG) { 1674 if (VM_Version::supports_avx2()) { 1675 vpbroadcastq(dst, src, vlen_enc); 1676 } else { 1677 vmovddup(dst, src, vlen_enc); 1678 } 1679 } else if (bt == T_DOUBLE) { 1680 if (vlen_enc != Assembler::AVX_128bit) { 1681 vbroadcastsd(dst, src, vlen_enc, noreg); 1682 } else { 1683 vmovddup(dst, src, vlen_enc); 1684 } 1685 } else { 1686 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1687 vpbroadcastd(dst, src, vlen_enc); 1688 } else { 1689 vbroadcastss(dst, src, vlen_enc); 1690 } 1691 } 1692 } else if (VM_Version::supports_sse3()) { 1693 movddup(dst, src); 1694 } else { 1695 load_vector(bt, dst, src, vlen); 1696 } 1697 } 1698 1699 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1700 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1701 int offset = exact_log2(type2aelembytes(bt)) << 6; 1702 if (is_floating_point_type(bt)) { 1703 offset += 128; 1704 } 1705 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1706 load_vector(T_BYTE, dst, addr, vlen_in_bytes); 1707 } 1708 1709 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
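// The reduce* helpers below implement tree reductions: the upper half of the
// vector is repeatedly extracted (vextracti64x4_high / vextracti128_high /
// pshufd) and combined element-wise with the lower half until one element
// remains, which is then combined with the scalar input src1 and, for subword
// types, sign-extended into dst. For example, the generic 8-lane int path is
//   v = op(high128(v), low128(v));   // 8 -> 4 lanes
//   v = op(shuffle(v, 0xE), v);      // 4 -> 2 lanes
//   v = op(shuffle(v, 0x1), v);      // 2 -> 1 lane
//   dst = op(v, src1);
// (Op_AddReductionVI additionally uses the horizontal adds phaddd/vphaddd.)
// The "unordered" float/double variants combine lanes pairwise with packed
// addps/mulps/addpd/mulpd, while the ordered variants accumulate lane by lane
// with scalar addss/mulss/addsd/mulsd to preserve the left-to-right
// evaluation order required for ordered FP reductions.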
1710 1711 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1712 int vector_len = Assembler::AVX_128bit; 1713 1714 switch (opcode) { 1715 case Op_AndReductionV: pand(dst, src); break; 1716 case Op_OrReductionV: por (dst, src); break; 1717 case Op_XorReductionV: pxor(dst, src); break; 1718 case Op_MinReductionV: 1719 switch (typ) { 1720 case T_BYTE: pminsb(dst, src); break; 1721 case T_SHORT: pminsw(dst, src); break; 1722 case T_INT: pminsd(dst, src); break; 1723 case T_LONG: assert(UseAVX > 2, "required"); 1724 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1725 default: assert(false, "wrong type"); 1726 } 1727 break; 1728 case Op_MaxReductionV: 1729 switch (typ) { 1730 case T_BYTE: pmaxsb(dst, src); break; 1731 case T_SHORT: pmaxsw(dst, src); break; 1732 case T_INT: pmaxsd(dst, src); break; 1733 case T_LONG: assert(UseAVX > 2, "required"); 1734 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1735 default: assert(false, "wrong type"); 1736 } 1737 break; 1738 case Op_AddReductionVF: addss(dst, src); break; 1739 case Op_AddReductionVD: addsd(dst, src); break; 1740 case Op_AddReductionVI: 1741 switch (typ) { 1742 case T_BYTE: paddb(dst, src); break; 1743 case T_SHORT: paddw(dst, src); break; 1744 case T_INT: paddd(dst, src); break; 1745 default: assert(false, "wrong type"); 1746 } 1747 break; 1748 case Op_AddReductionVL: paddq(dst, src); break; 1749 case Op_MulReductionVF: mulss(dst, src); break; 1750 case Op_MulReductionVD: mulsd(dst, src); break; 1751 case Op_MulReductionVI: 1752 switch (typ) { 1753 case T_SHORT: pmullw(dst, src); break; 1754 case T_INT: pmulld(dst, src); break; 1755 default: assert(false, "wrong type"); 1756 } 1757 break; 1758 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1759 evpmullq(dst, dst, src, vector_len); break; 1760 default: assert(false, "wrong opcode"); 1761 } 1762 } 1763 1764 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1765 switch (opcode) { 1766 case Op_AddReductionVF: addps(dst, src); break; 1767 case Op_AddReductionVD: addpd(dst, src); break; 1768 case Op_MulReductionVF: mulps(dst, src); break; 1769 case Op_MulReductionVD: mulpd(dst, src); break; 1770 default: assert(false, "%s", NodeClassNames[opcode]); 1771 } 1772 } 1773 1774 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1775 int vector_len = Assembler::AVX_256bit; 1776 1777 switch (opcode) { 1778 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1779 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1780 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1781 case Op_MinReductionV: 1782 switch (typ) { 1783 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1784 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1785 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1786 case T_LONG: assert(UseAVX > 2, "required"); 1787 vpminsq(dst, src1, src2, vector_len); break; 1788 default: assert(false, "wrong type"); 1789 } 1790 break; 1791 case Op_MaxReductionV: 1792 switch (typ) { 1793 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1794 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1795 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1796 case T_LONG: assert(UseAVX > 2, "required"); 1797 vpmaxsq(dst, src1, src2, vector_len); break; 1798 default: assert(false, "wrong type"); 1799 } 
1800 break; 1801 case Op_AddReductionVI: 1802 switch (typ) { 1803 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1804 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1805 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1806 default: assert(false, "wrong type"); 1807 } 1808 break; 1809 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1810 case Op_MulReductionVI: 1811 switch (typ) { 1812 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1813 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1814 default: assert(false, "wrong type"); 1815 } 1816 break; 1817 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1818 default: assert(false, "wrong opcode"); 1819 } 1820 } 1821 1822 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1823 int vector_len = Assembler::AVX_256bit; 1824 1825 switch (opcode) { 1826 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1827 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1828 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1829 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1830 default: assert(false, "%s", NodeClassNames[opcode]); 1831 } 1832 } 1833 1834 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1835 XMMRegister dst, XMMRegister src, 1836 XMMRegister vtmp1, XMMRegister vtmp2) { 1837 switch (opcode) { 1838 case Op_AddReductionVF: 1839 case Op_MulReductionVF: 1840 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1841 break; 1842 1843 case Op_AddReductionVD: 1844 case Op_MulReductionVD: 1845 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1846 break; 1847 1848 default: assert(false, "wrong opcode"); 1849 } 1850 } 1851 1852 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 1853 XMMRegister dst, XMMRegister src, 1854 XMMRegister vtmp1, XMMRegister vtmp2) { 1855 switch (opcode) { 1856 case Op_AddReductionVF: 1857 case Op_MulReductionVF: 1858 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1859 break; 1860 1861 case Op_AddReductionVD: 1862 case Op_MulReductionVD: 1863 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1864 break; 1865 1866 default: assert(false, "%s", NodeClassNames[opcode]); 1867 } 1868 } 1869 1870 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1871 Register dst, Register src1, XMMRegister src2, 1872 XMMRegister vtmp1, XMMRegister vtmp2) { 1873 switch (vlen) { 1874 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1875 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1876 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1877 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1878 1879 default: assert(false, "wrong vector length"); 1880 } 1881 } 1882 1883 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1884 Register dst, Register src1, XMMRegister src2, 1885 XMMRegister vtmp1, XMMRegister vtmp2) { 1886 switch (vlen) { 1887 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1888 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1889 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1890 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1891 1892 default: assert(false, "wrong vector length"); 1893 } 1894 } 1895 1896 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1897 Register dst, Register src1, XMMRegister src2, 
1898 XMMRegister vtmp1, XMMRegister vtmp2) { 1899 switch (vlen) { 1900 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1901 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1902 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1903 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1904 1905 default: assert(false, "wrong vector length"); 1906 } 1907 } 1908 1909 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1910 Register dst, Register src1, XMMRegister src2, 1911 XMMRegister vtmp1, XMMRegister vtmp2) { 1912 switch (vlen) { 1913 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1914 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1915 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1916 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1917 1918 default: assert(false, "wrong vector length"); 1919 } 1920 } 1921 1922 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1923 Register dst, Register src1, XMMRegister src2, 1924 XMMRegister vtmp1, XMMRegister vtmp2) { 1925 switch (vlen) { 1926 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1927 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1928 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1929 1930 default: assert(false, "wrong vector length"); 1931 } 1932 } 1933 1934 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1935 switch (vlen) { 1936 case 2: 1937 assert(vtmp2 == xnoreg, ""); 1938 reduce2F(opcode, dst, src, vtmp1); 1939 break; 1940 case 4: 1941 assert(vtmp2 == xnoreg, ""); 1942 reduce4F(opcode, dst, src, vtmp1); 1943 break; 1944 case 8: 1945 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1946 break; 1947 case 16: 1948 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1949 break; 1950 default: assert(false, "wrong vector length"); 1951 } 1952 } 1953 1954 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1955 switch (vlen) { 1956 case 2: 1957 assert(vtmp2 == xnoreg, ""); 1958 reduce2D(opcode, dst, src, vtmp1); 1959 break; 1960 case 4: 1961 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1962 break; 1963 case 8: 1964 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1965 break; 1966 default: assert(false, "wrong vector length"); 1967 } 1968 } 1969 1970 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1971 switch (vlen) { 1972 case 2: 1973 assert(vtmp1 == xnoreg, ""); 1974 assert(vtmp2 == xnoreg, ""); 1975 unorderedReduce2F(opcode, dst, src); 1976 break; 1977 case 4: 1978 assert(vtmp2 == xnoreg, ""); 1979 unorderedReduce4F(opcode, dst, src, vtmp1); 1980 break; 1981 case 8: 1982 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 1983 break; 1984 case 16: 1985 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 1986 break; 1987 default: assert(false, "wrong vector length"); 1988 } 1989 } 1990 1991 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1992 switch (vlen) { 1993 case 2: 1994 assert(vtmp1 == xnoreg, ""); 1995 assert(vtmp2 == xnoreg, ""); 1996 unorderedReduce2D(opcode, dst, src); 1997 break; 1998 case 4: 1999 assert(vtmp2 == xnoreg, ""); 2000 unorderedReduce4D(opcode, dst, src, vtmp1); 2001 break; 2002 case 8: 2003 unorderedReduce8D(opcode, dst, 
src, vtmp1, vtmp2); 2004 break; 2005 default: assert(false, "wrong vector length"); 2006 } 2007 } 2008 2009 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2010 if (opcode == Op_AddReductionVI) { 2011 if (vtmp1 != src2) { 2012 movdqu(vtmp1, src2); 2013 } 2014 phaddd(vtmp1, vtmp1); 2015 } else { 2016 pshufd(vtmp1, src2, 0x1); 2017 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2018 } 2019 movdl(vtmp2, src1); 2020 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2021 movdl(dst, vtmp1); 2022 } 2023 2024 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2025 if (opcode == Op_AddReductionVI) { 2026 if (vtmp1 != src2) { 2027 movdqu(vtmp1, src2); 2028 } 2029 phaddd(vtmp1, src2); 2030 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2031 } else { 2032 pshufd(vtmp2, src2, 0xE); 2033 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2034 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2035 } 2036 } 2037 2038 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2039 if (opcode == Op_AddReductionVI) { 2040 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2041 vextracti128_high(vtmp2, vtmp1); 2042 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2043 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2044 } else { 2045 vextracti128_high(vtmp1, src2); 2046 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2047 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2048 } 2049 } 2050 2051 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2052 vextracti64x4_high(vtmp2, src2); 2053 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2054 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2055 } 2056 2057 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2058 pshufd(vtmp2, src2, 0x1); 2059 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2060 movdqu(vtmp1, vtmp2); 2061 psrldq(vtmp1, 2); 2062 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2063 movdqu(vtmp2, vtmp1); 2064 psrldq(vtmp2, 1); 2065 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2066 movdl(vtmp2, src1); 2067 pmovsxbd(vtmp1, vtmp1); 2068 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2069 pextrb(dst, vtmp1, 0x0); 2070 movsbl(dst, dst); 2071 } 2072 2073 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2074 pshufd(vtmp1, src2, 0xE); 2075 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2076 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2077 } 2078 2079 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2080 vextracti128_high(vtmp2, src2); 2081 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2082 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2083 } 2084 2085 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2086 vextracti64x4_high(vtmp1, src2); 2087 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2088 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2089 } 2090 2091 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister 
src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2092 pmovsxbw(vtmp2, src2); 2093 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2094 } 2095 2096 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2097 if (UseAVX > 1) { 2098 int vector_len = Assembler::AVX_256bit; 2099 vpmovsxbw(vtmp1, src2, vector_len); 2100 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2101 } else { 2102 pmovsxbw(vtmp2, src2); 2103 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2104 pshufd(vtmp2, src2, 0x1); 2105 pmovsxbw(vtmp2, src2); 2106 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2107 } 2108 } 2109 2110 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2111 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2112 int vector_len = Assembler::AVX_512bit; 2113 vpmovsxbw(vtmp1, src2, vector_len); 2114 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2115 } else { 2116 assert(UseAVX >= 2,"Should not reach here."); 2117 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2118 vextracti128_high(vtmp2, src2); 2119 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2120 } 2121 } 2122 2123 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2124 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2125 vextracti64x4_high(vtmp2, src2); 2126 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2127 } 2128 2129 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2130 if (opcode == Op_AddReductionVI) { 2131 if (vtmp1 != src2) { 2132 movdqu(vtmp1, src2); 2133 } 2134 phaddw(vtmp1, vtmp1); 2135 phaddw(vtmp1, vtmp1); 2136 } else { 2137 pshufd(vtmp2, src2, 0x1); 2138 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2139 movdqu(vtmp1, vtmp2); 2140 psrldq(vtmp1, 2); 2141 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2142 } 2143 movdl(vtmp2, src1); 2144 pmovsxwd(vtmp1, vtmp1); 2145 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2146 pextrw(dst, vtmp1, 0x0); 2147 movswl(dst, dst); 2148 } 2149 2150 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2151 if (opcode == Op_AddReductionVI) { 2152 if (vtmp1 != src2) { 2153 movdqu(vtmp1, src2); 2154 } 2155 phaddw(vtmp1, src2); 2156 } else { 2157 pshufd(vtmp1, src2, 0xE); 2158 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2159 } 2160 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2161 } 2162 2163 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2164 if (opcode == Op_AddReductionVI) { 2165 int vector_len = Assembler::AVX_256bit; 2166 vphaddw(vtmp2, src2, src2, vector_len); 2167 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2168 } else { 2169 vextracti128_high(vtmp2, src2); 2170 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2171 } 2172 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2173 } 2174 2175 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2176 int vector_len = Assembler::AVX_256bit; 2177 vextracti64x4_high(vtmp1, src2); 2178 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2179 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2180 } 2181 2182 void 
C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2183 pshufd(vtmp2, src2, 0xE); 2184 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2185 movdq(vtmp1, src1); 2186 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2187 movdq(dst, vtmp1); 2188 } 2189 2190 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2191 vextracti128_high(vtmp1, src2); 2192 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2193 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2194 } 2195 2196 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2197 vextracti64x4_high(vtmp2, src2); 2198 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2199 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2200 } 2201 2202 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2203 mov64(temp, -1L); 2204 bzhiq(temp, temp, len); 2205 kmovql(dst, temp); 2206 } 2207 2208 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2209 reduce_operation_128(T_FLOAT, opcode, dst, src); 2210 pshufd(vtmp, src, 0x1); 2211 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2212 } 2213 2214 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2215 reduce2F(opcode, dst, src, vtmp); 2216 pshufd(vtmp, src, 0x2); 2217 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2218 pshufd(vtmp, src, 0x3); 2219 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2220 } 2221 2222 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2223 reduce4F(opcode, dst, src, vtmp2); 2224 vextractf128_high(vtmp2, src); 2225 reduce4F(opcode, dst, vtmp2, vtmp1); 2226 } 2227 2228 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2229 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2230 vextracti64x4_high(vtmp1, src); 2231 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2232 } 2233 2234 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2235 pshufd(dst, src, 0x1); 2236 reduce_operation_128(T_FLOAT, opcode, dst, src); 2237 } 2238 2239 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2240 pshufd(vtmp, src, 0xE); 2241 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2242 unorderedReduce2F(opcode, dst, vtmp); 2243 } 2244 2245 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2246 vextractf128_high(vtmp1, src); 2247 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2248 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2249 } 2250 2251 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2252 vextractf64x4_high(vtmp2, src); 2253 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2254 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2255 } 2256 2257 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2258 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2259 pshufd(vtmp, src, 0xE); 2260 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2261 } 2262 2263 void 
C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2264 reduce2D(opcode, dst, src, vtmp2); 2265 vextractf128_high(vtmp2, src); 2266 reduce2D(opcode, dst, vtmp2, vtmp1); 2267 } 2268 2269 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2270 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2271 vextracti64x4_high(vtmp1, src); 2272 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2273 } 2274 2275 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2276 pshufd(dst, src, 0xE); 2277 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2278 } 2279 2280 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2281 vextractf128_high(vtmp, src); 2282 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2283 unorderedReduce2D(opcode, dst, vtmp); 2284 } 2285 2286 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2287 vextractf64x4_high(vtmp2, src); 2288 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2289 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2290 } 2291 2292 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2293 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2294 } 2295 2296 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2297 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2298 } 2299 2300 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2301 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2302 } 2303 2304 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2305 int vec_enc) { 2306 switch(elem_bt) { 2307 case T_INT: 2308 case T_FLOAT: 2309 vmaskmovps(dst, src, mask, vec_enc); 2310 break; 2311 case T_LONG: 2312 case T_DOUBLE: 2313 vmaskmovpd(dst, src, mask, vec_enc); 2314 break; 2315 default: 2316 fatal("Unsupported type %s", type2name(elem_bt)); 2317 break; 2318 } 2319 } 2320 2321 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2322 int vec_enc) { 2323 switch(elem_bt) { 2324 case T_INT: 2325 case T_FLOAT: 2326 vmaskmovps(dst, src, mask, vec_enc); 2327 break; 2328 case T_LONG: 2329 case T_DOUBLE: 2330 vmaskmovpd(dst, src, mask, vec_enc); 2331 break; 2332 default: 2333 fatal("Unsupported type %s", type2name(elem_bt)); 2334 break; 2335 } 2336 } 2337 2338 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2339 XMMRegister dst, XMMRegister src, 2340 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2341 XMMRegister xmm_0, XMMRegister xmm_1) { 2342 const int permconst[] = {1, 14}; 2343 XMMRegister wsrc = src; 2344 XMMRegister wdst = xmm_0; 2345 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2346 2347 int vlen_enc = Assembler::AVX_128bit; 2348 if (vlen == 16) { 2349 vlen_enc = Assembler::AVX_256bit; 2350 } 2351 2352 for (int i = log2(vlen) - 1; i >=0; i--) { 2353 if (i == 0 && !is_dst_valid) { 2354 wdst = dst; 2355 } 2356 if (i == 3) { 2357 vextracti64x4_high(wtmp, wsrc); 2358 } else if (i == 2) { 2359 vextracti128_high(wtmp, wsrc); 2360 } else { // i = [0,1] 2361 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2362 } 2363 2364 if (VM_Version::supports_avx10_2()) { 2365 vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc); 2366 } else { 2367 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2368 } 2369 wsrc = wdst; 2370 vlen_enc = Assembler::AVX_128bit; 2371 } 2372 if (is_dst_valid) { 2373 if (VM_Version::supports_avx10_2()) { 2374 vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit); 2375 } else { 2376 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2377 } 2378 } 2379 } 2380 2381 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2382 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2383 XMMRegister xmm_0, XMMRegister xmm_1) { 2384 XMMRegister wsrc = src; 2385 XMMRegister wdst = xmm_0; 2386 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2387 int vlen_enc = Assembler::AVX_128bit; 2388 if (vlen == 8) { 2389 vlen_enc = Assembler::AVX_256bit; 2390 } 2391 for (int i = log2(vlen) - 1; i >=0; i--) { 2392 if (i == 0 && !is_dst_valid) { 2393 wdst = dst; 2394 } 2395 if (i == 1) { 2396 vextracti128_high(wtmp, wsrc); 2397 } else if (i == 2) { 2398 vextracti64x4_high(wtmp, wsrc); 2399 } else { 2400 assert(i == 0, "%d", i); 2401 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2402 } 2403 2404 if (VM_Version::supports_avx10_2()) { 2405 vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc); 2406 } else { 2407 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2408 } 2409 2410 wsrc = wdst; 2411 vlen_enc = Assembler::AVX_128bit; 2412 } 2413 2414 if (is_dst_valid) { 2415 if (VM_Version::supports_avx10_2()) { 2416 vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit); 2417 } else { 2418 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2419 } 2420 } 2421 } 2422 2423 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2424 switch (bt) { 2425 case T_BYTE: pextrb(dst, src, idx); break; 2426 case T_SHORT: pextrw(dst, src, idx); break; 2427 case T_INT: pextrd(dst, src, idx); break; 2428 case T_LONG: pextrq(dst, src, idx); break; 2429 2430 default: 2431 assert(false,"Should not reach here."); 2432 break; 2433 } 2434 } 2435 2436 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2437 int esize = type2aelembytes(typ); 2438 int elem_per_lane = 16/esize; 2439 int lane = elemindex / elem_per_lane; 2440 int eindex = elemindex % elem_per_lane; 2441 2442 if (lane >= 2) { 2443 assert(UseAVX > 2, "required"); 2444 vextractf32x4(dst, src, lane & 3); 2445 return dst; 2446 } else if (lane > 0) { 2447 assert(UseAVX > 0, "required"); 2448 vextractf128(dst, src, lane); 2449 return dst; 2450 } else { 2451 return src; 2452 } 2453 } 2454 2455 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2456 if (typ == T_BYTE) { 2457 movsbl(dst, dst); 2458 } else if (typ == T_SHORT) { 2459 movswl(dst, dst); 2460 } 2461 } 2462 2463 void C2_MacroAssembler::get_elem(BasicType typ, Register 
dst, XMMRegister src, int elemindex) { 2464 int esize = type2aelembytes(typ); 2465 int elem_per_lane = 16/esize; 2466 int eindex = elemindex % elem_per_lane; 2467 assert(is_integral_type(typ),"required"); 2468 2469 if (eindex == 0) { 2470 if (typ == T_LONG) { 2471 movq(dst, src); 2472 } else { 2473 movdl(dst, src); 2474 movsxl(typ, dst); 2475 } 2476 } else { 2477 extract(typ, dst, src, eindex); 2478 movsxl(typ, dst); 2479 } 2480 } 2481 2482 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2483 int esize = type2aelembytes(typ); 2484 int elem_per_lane = 16/esize; 2485 int eindex = elemindex % elem_per_lane; 2486 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2487 2488 if (eindex == 0) { 2489 movq(dst, src); 2490 } else { 2491 if (typ == T_FLOAT) { 2492 if (UseAVX == 0) { 2493 movdqu(dst, src); 2494 shufps(dst, dst, eindex); 2495 } else { 2496 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2497 } 2498 } else { 2499 if (UseAVX == 0) { 2500 movdqu(dst, src); 2501 psrldq(dst, eindex*esize); 2502 } else { 2503 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2504 } 2505 movq(dst, dst); 2506 } 2507 } 2508 // Zero upper bits 2509 if (typ == T_FLOAT) { 2510 if (UseAVX == 0) { 2511 assert(vtmp != xnoreg, "required."); 2512 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2513 pand(dst, vtmp); 2514 } else { 2515 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2516 } 2517 } 2518 } 2519 2520 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2521 switch(typ) { 2522 case T_BYTE: 2523 case T_BOOLEAN: 2524 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2525 break; 2526 case T_SHORT: 2527 case T_CHAR: 2528 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2529 break; 2530 case T_INT: 2531 case T_FLOAT: 2532 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2533 break; 2534 case T_LONG: 2535 case T_DOUBLE: 2536 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2537 break; 2538 default: 2539 assert(false,"Should not reach here."); 2540 break; 2541 } 2542 } 2543 2544 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2545 assert(rscratch != noreg || always_reachable(src2), "missing"); 2546 2547 switch(typ) { 2548 case T_BOOLEAN: 2549 case T_BYTE: 2550 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2551 break; 2552 case T_CHAR: 2553 case T_SHORT: 2554 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2555 break; 2556 case T_INT: 2557 case T_FLOAT: 2558 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2559 break; 2560 case T_LONG: 2561 case T_DOUBLE: 2562 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2563 break; 2564 default: 2565 assert(false,"Should not reach here."); 2566 break; 2567 } 2568 } 2569 2570 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2571 switch(typ) { 2572 case T_BYTE: 2573 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 
2574 break; 2575 case T_SHORT: 2576 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2577 break; 2578 case T_INT: 2579 case T_FLOAT: 2580 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2581 break; 2582 case T_LONG: 2583 case T_DOUBLE: 2584 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2585 break; 2586 default: 2587 assert(false,"Should not reach here."); 2588 break; 2589 } 2590 } 2591 2592 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2593 assert(vlen_in_bytes <= 32, ""); 2594 int esize = type2aelembytes(bt); 2595 if (vlen_in_bytes == 32) { 2596 assert(vtmp == xnoreg, "required."); 2597 if (esize >= 4) { 2598 vtestps(src1, src2, AVX_256bit); 2599 } else { 2600 vptest(src1, src2, AVX_256bit); 2601 } 2602 return; 2603 } 2604 if (vlen_in_bytes < 16) { 2605 // Duplicate the lower part to fill the whole register, 2606 // Don't need to do so for src2 2607 assert(vtmp != xnoreg, "required"); 2608 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2609 pshufd(vtmp, src1, shuffle_imm); 2610 } else { 2611 assert(vtmp == xnoreg, "required"); 2612 vtmp = src1; 2613 } 2614 if (esize >= 4 && VM_Version::supports_avx()) { 2615 vtestps(vtmp, src2, AVX_128bit); 2616 } else { 2617 ptest(vtmp, src2); 2618 } 2619 } 2620 2621 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2622 #ifdef ASSERT 2623 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2624 bool is_bw_supported = VM_Version::supports_avx512bw(); 2625 if (is_bw && !is_bw_supported) { 2626 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2627 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2628 "XMM register should be 0-15"); 2629 } 2630 #endif // ASSERT 2631 switch (elem_bt) { 2632 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2633 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2634 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2635 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2636 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2637 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2638 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2639 } 2640 } 2641 2642 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2643 assert(UseAVX >= 2, "required"); 2644 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2645 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2646 if ((UseAVX > 2) && 2647 (!is_bw || VM_Version::supports_avx512bw()) && 2648 (!is_vl || VM_Version::supports_avx512vl())) { 2649 switch (elem_bt) { 2650 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2651 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2652 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2653 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2654 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2655 } 2656 } else { 2657 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2658 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2659 switch (elem_bt) { 2660 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2661 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2662 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2663 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, 
vlen_enc); return; 2664 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2665 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2666 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2667 } 2668 } 2669 } 2670 2671 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2672 switch (to_elem_bt) { 2673 case T_SHORT: 2674 vpmovsxbw(dst, src, vlen_enc); 2675 break; 2676 case T_INT: 2677 vpmovsxbd(dst, src, vlen_enc); 2678 break; 2679 case T_FLOAT: 2680 vpmovsxbd(dst, src, vlen_enc); 2681 vcvtdq2ps(dst, dst, vlen_enc); 2682 break; 2683 case T_LONG: 2684 vpmovsxbq(dst, src, vlen_enc); 2685 break; 2686 case T_DOUBLE: { 2687 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2688 vpmovsxbd(dst, src, mid_vlen_enc); 2689 vcvtdq2pd(dst, dst, vlen_enc); 2690 break; 2691 } 2692 default: 2693 fatal("Unsupported type %s", type2name(to_elem_bt)); 2694 break; 2695 } 2696 } 2697 2698 //------------------------------------------------------------------------------------------- 2699 2700 // IndexOf for constant substrings with size >= 8 chars 2701 // which don't need to be loaded through stack. 2702 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2703 Register cnt1, Register cnt2, 2704 int int_cnt2, Register result, 2705 XMMRegister vec, Register tmp, 2706 int ae) { 2707 ShortBranchVerifier sbv(this); 2708 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2709 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2710 2711 // This method uses the pcmpestri instruction with bound registers 2712 // inputs: 2713 // xmm - substring 2714 // rax - substring length (elements count) 2715 // mem - scanned string 2716 // rdx - string length (elements count) 2717 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2718 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2719 // outputs: 2720 // rcx - matched index in string 2721 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2722 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2723 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2724 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2725 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2726 2727 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2728 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2729 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2730 2731 // Note, inline_string_indexOf() generates checks: 2732 // if (substr.count > string.count) return -1; 2733 // if (substr.count == 0) return 0; 2734 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2735 2736 // Load substring. 
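// For the UL case the Latin1 substring bytes are zero-extended to 16-bit
// chars (pmovzxbw) so they can be compared against the UTF-16 string; for
// LL/UU the first 16 bytes (16 Latin1 chars or 8 UTF-16 chars) are loaded
// directly with movdqu.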
2737 if (ae == StrIntrinsicNode::UL) { 2738 pmovzxbw(vec, Address(str2, 0)); 2739 } else { 2740 movdqu(vec, Address(str2, 0)); 2741 } 2742 movl(cnt2, int_cnt2); 2743 movptr(result, str1); // string addr 2744 2745 if (int_cnt2 > stride) { 2746 jmpb(SCAN_TO_SUBSTR); 2747 2748 // Reload substr for rescan, this code 2749 // is executed only for large substrings (> 8 chars) 2750 bind(RELOAD_SUBSTR); 2751 if (ae == StrIntrinsicNode::UL) { 2752 pmovzxbw(vec, Address(str2, 0)); 2753 } else { 2754 movdqu(vec, Address(str2, 0)); 2755 } 2756 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2757 2758 bind(RELOAD_STR); 2759 // We came here after the beginning of the substring was 2760 // matched but the rest of it was not so we need to search 2761 // again. Start from the next element after the previous match. 2762 2763 // cnt2 is number of substring reminding elements and 2764 // cnt1 is number of string reminding elements when cmp failed. 2765 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2766 subl(cnt1, cnt2); 2767 addl(cnt1, int_cnt2); 2768 movl(cnt2, int_cnt2); // Now restore cnt2 2769 2770 decrementl(cnt1); // Shift to next element 2771 cmpl(cnt1, cnt2); 2772 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2773 2774 addptr(result, (1<<scale1)); 2775 2776 } // (int_cnt2 > 8) 2777 2778 // Scan string for start of substr in 16-byte vectors 2779 bind(SCAN_TO_SUBSTR); 2780 pcmpestri(vec, Address(result, 0), mode); 2781 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2782 subl(cnt1, stride); 2783 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2784 cmpl(cnt1, cnt2); 2785 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2786 addptr(result, 16); 2787 jmpb(SCAN_TO_SUBSTR); 2788 2789 // Found a potential substr 2790 bind(FOUND_CANDIDATE); 2791 // Matched whole vector if first element matched (tmp(rcx) == 0). 2792 if (int_cnt2 == stride) { 2793 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2794 } else { // int_cnt2 > 8 2795 jccb(Assembler::overflow, FOUND_SUBSTR); 2796 } 2797 // After pcmpestri tmp(rcx) contains matched element index 2798 // Compute start addr of substr 2799 lea(result, Address(result, tmp, scale1)); 2800 2801 // Make sure string is still long enough 2802 subl(cnt1, tmp); 2803 cmpl(cnt1, cnt2); 2804 if (int_cnt2 == stride) { 2805 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2806 } else { // int_cnt2 > 8 2807 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2808 } 2809 // Left less then substring. 2810 2811 bind(RET_NOT_FOUND); 2812 movl(result, -1); 2813 jmp(EXIT); 2814 2815 if (int_cnt2 > stride) { 2816 // This code is optimized for the case when whole substring 2817 // is matched if its head is matched. 2818 bind(MATCH_SUBSTR_HEAD); 2819 pcmpestri(vec, Address(result, 0), mode); 2820 // Reload only string if does not match 2821 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2822 2823 Label CONT_SCAN_SUBSTR; 2824 // Compare the rest of substring (> 8 chars). 2825 bind(FOUND_SUBSTR); 2826 // First 8 chars are already matched. 
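// cnt2 is negated below so that it can serve as a single (negative) index
// that walks both strings towards their tails: the addresses in the scan
// loop are formed as (base, cnt2, scale, tail_offset), and the loop keeps
// adding 'stride' to cnt2, continuing while cnt2 is still negative.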
2827 negptr(cnt2); 2828 addptr(cnt2, stride); 2829 2830 bind(SCAN_SUBSTR); 2831 subl(cnt1, stride); 2832 cmpl(cnt2, -stride); // Do not read beyond substring 2833 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2834 // Back-up strings to avoid reading beyond substring: 2835 // cnt1 = cnt1 - cnt2 + 8 2836 addl(cnt1, cnt2); // cnt2 is negative 2837 addl(cnt1, stride); 2838 movl(cnt2, stride); negptr(cnt2); 2839 bind(CONT_SCAN_SUBSTR); 2840 if (int_cnt2 < (int)G) { 2841 int tail_off1 = int_cnt2<<scale1; 2842 int tail_off2 = int_cnt2<<scale2; 2843 if (ae == StrIntrinsicNode::UL) { 2844 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2845 } else { 2846 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2847 } 2848 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2849 } else { 2850 // calculate index in register to avoid integer overflow (int_cnt2*2) 2851 movl(tmp, int_cnt2); 2852 addptr(tmp, cnt2); 2853 if (ae == StrIntrinsicNode::UL) { 2854 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2855 } else { 2856 movdqu(vec, Address(str2, tmp, scale2, 0)); 2857 } 2858 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2859 } 2860 // Need to reload strings pointers if not matched whole vector 2861 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2862 addptr(cnt2, stride); 2863 jcc(Assembler::negative, SCAN_SUBSTR); 2864 // Fall through if found full substring 2865 2866 } // (int_cnt2 > 8) 2867 2868 bind(RET_FOUND); 2869 // Found result if we matched full small substring. 2870 // Compute substr offset 2871 subptr(result, str1); 2872 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2873 shrl(result, 1); // index 2874 } 2875 bind(EXIT); 2876 2877 } // string_indexofC8 2878 2879 // Small strings are loaded through stack if they cross page boundary. 2880 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2881 Register cnt1, Register cnt2, 2882 int int_cnt2, Register result, 2883 XMMRegister vec, Register tmp, 2884 int ae) { 2885 ShortBranchVerifier sbv(this); 2886 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2887 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2888 2889 // 2890 // int_cnt2 is length of small (< 8 chars) constant substring 2891 // or (-1) for non constant substring in which case its length 2892 // is in cnt2 register. 2893 // 2894 // Note, inline_string_indexOf() generates checks: 2895 // if (substr.count > string.count) return -1; 2896 // if (substr.count == 0) return 0; 2897 // 2898 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2899 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2900 // This method uses the pcmpestri instruction with bound registers 2901 // inputs: 2902 // xmm - substring 2903 // rax - substring length (elements count) 2904 // mem - scanned string 2905 // rdx - string length (elements count) 2906 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2907 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2908 // outputs: 2909 // rcx - matched index in string 2910 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2911 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2912 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2913 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2914 2915 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2916 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2917 FOUND_CANDIDATE; 2918 2919 { //======================================================== 2920 // We don't know where these strings are located 2921 // and we can't read beyond them. Load them through stack. 2922 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2923 2924 movptr(tmp, rsp); // save old SP 2925 2926 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2927 if (int_cnt2 == (1>>scale2)) { // One byte 2928 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2929 load_unsigned_byte(result, Address(str2, 0)); 2930 movdl(vec, result); // move 32 bits 2931 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2932 // Not enough header space in 32-bit VM: 12+3 = 15. 2933 movl(result, Address(str2, -1)); 2934 shrl(result, 8); 2935 movdl(vec, result); // move 32 bits 2936 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2937 load_unsigned_short(result, Address(str2, 0)); 2938 movdl(vec, result); // move 32 bits 2939 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2940 movdl(vec, Address(str2, 0)); // move 32 bits 2941 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2942 movq(vec, Address(str2, 0)); // move 64 bits 2943 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2944 // Array header size is 12 bytes in 32-bit VM 2945 // + 6 bytes for 3 chars == 18 bytes, 2946 // enough space to load vec and shift. 2947 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2948 if (ae == StrIntrinsicNode::UL) { 2949 int tail_off = int_cnt2-8; 2950 pmovzxbw(vec, Address(str2, tail_off)); 2951 psrldq(vec, -2*tail_off); 2952 } 2953 else { 2954 int tail_off = int_cnt2*(1<<scale2); 2955 movdqu(vec, Address(str2, tail_off-16)); 2956 psrldq(vec, 16-tail_off); 2957 } 2958 } 2959 } else { // not constant substring 2960 cmpl(cnt2, stride); 2961 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2962 2963 // We can read beyond string if srt+16 does not cross page boundary 2964 // since heaps are aligned and mapped by pages. 2965 assert(os::vm_page_size() < (int)G, "default page should be small"); 2966 movl(result, str2); // We need only low 32 bits 2967 andl(result, ((int)os::vm_page_size()-1)); 2968 cmpl(result, ((int)os::vm_page_size()-16)); 2969 jccb(Assembler::belowEqual, CHECK_STR); 2970 2971 // Move small strings to stack to allow load 16 bytes into vec. 2972 subptr(rsp, 16); 2973 int stk_offset = wordSize-(1<<scale2); 2974 push(cnt2); 2975 2976 bind(COPY_SUBSTR); 2977 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2978 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2979 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2980 } else if (ae == StrIntrinsicNode::UU) { 2981 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2982 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2983 } 2984 decrement(cnt2); 2985 jccb(Assembler::notZero, COPY_SUBSTR); 2986 2987 pop(cnt2); 2988 movptr(str2, rsp); // New substring address 2989 } // non constant 2990 2991 bind(CHECK_STR); 2992 cmpl(cnt1, stride); 2993 jccb(Assembler::aboveEqual, BIG_STRINGS); 2994 2995 // Check cross page boundary. 
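// A 16-byte load from str1 is only safe if it does not cross a page
// boundary, i.e. if (str1 & (page_size - 1)) <= page_size - 16. If that
// holds, the string can be scanned in place (BIG_STRINGS); otherwise it is
// copied onto the stack below so the vector loads cannot fault.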
2996 movl(result, str1); // We need only low 32 bits 2997 andl(result, ((int)os::vm_page_size()-1)); 2998 cmpl(result, ((int)os::vm_page_size()-16)); 2999 jccb(Assembler::belowEqual, BIG_STRINGS); 3000 3001 subptr(rsp, 16); 3002 int stk_offset = -(1<<scale1); 3003 if (int_cnt2 < 0) { // not constant 3004 push(cnt2); 3005 stk_offset += wordSize; 3006 } 3007 movl(cnt2, cnt1); 3008 3009 bind(COPY_STR); 3010 if (ae == StrIntrinsicNode::LL) { 3011 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3012 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3013 } else { 3014 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3015 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3016 } 3017 decrement(cnt2); 3018 jccb(Assembler::notZero, COPY_STR); 3019 3020 if (int_cnt2 < 0) { // not constant 3021 pop(cnt2); 3022 } 3023 movptr(str1, rsp); // New string address 3024 3025 bind(BIG_STRINGS); 3026 // Load substring. 3027 if (int_cnt2 < 0) { // -1 3028 if (ae == StrIntrinsicNode::UL) { 3029 pmovzxbw(vec, Address(str2, 0)); 3030 } else { 3031 movdqu(vec, Address(str2, 0)); 3032 } 3033 push(cnt2); // substr count 3034 push(str2); // substr addr 3035 push(str1); // string addr 3036 } else { 3037 // Small (< 8 chars) constant substrings are loaded already. 3038 movl(cnt2, int_cnt2); 3039 } 3040 push(tmp); // original SP 3041 3042 } // Finished loading 3043 3044 //======================================================== 3045 // Start search 3046 // 3047 3048 movptr(result, str1); // string addr 3049 3050 if (int_cnt2 < 0) { // Only for non constant substring 3051 jmpb(SCAN_TO_SUBSTR); 3052 3053 // SP saved at sp+0 3054 // String saved at sp+1*wordSize 3055 // Substr saved at sp+2*wordSize 3056 // Substr count saved at sp+3*wordSize 3057 3058 // Reload substr for rescan, this code 3059 // is executed only for large substrings (> 8 chars) 3060 bind(RELOAD_SUBSTR); 3061 movptr(str2, Address(rsp, 2*wordSize)); 3062 movl(cnt2, Address(rsp, 3*wordSize)); 3063 if (ae == StrIntrinsicNode::UL) { 3064 pmovzxbw(vec, Address(str2, 0)); 3065 } else { 3066 movdqu(vec, Address(str2, 0)); 3067 } 3068 // We came here after the beginning of the substring was 3069 // matched but the rest of it was not so we need to search 3070 // again. Start from the next element after the previous match. 3071 subptr(str1, result); // Restore counter 3072 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3073 shrl(str1, 1); 3074 } 3075 addl(cnt1, str1); 3076 decrementl(cnt1); // Shift to next element 3077 cmpl(cnt1, cnt2); 3078 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3079 3080 addptr(result, (1<<scale1)); 3081 } // non constant 3082 3083 // Scan string for start of substr in 16-byte vectors 3084 bind(SCAN_TO_SUBSTR); 3085 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3086 pcmpestri(vec, Address(result, 0), mode); 3087 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3088 subl(cnt1, stride); 3089 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3090 cmpl(cnt1, cnt2); 3091 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3092 addptr(result, 16); 3093 3094 bind(ADJUST_STR); 3095 cmpl(cnt1, stride); // Do not read beyond string 3096 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3097 // Back-up string to avoid reading beyond string. 
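// Fewer than 'stride' elements of the string remain, so move the scan
// window back so that its final 16-byte read ends exactly at the end of the
// string, and reset cnt1 to a full stride; the overlapping elements are
// simply scanned again.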
3098 lea(result, Address(result, cnt1, scale1, -16)); 3099 movl(cnt1, stride); 3100 jmpb(SCAN_TO_SUBSTR); 3101 3102 // Found a potential substr 3103 bind(FOUND_CANDIDATE); 3104 // After pcmpestri tmp(rcx) contains matched element index 3105 3106 // Make sure string is still long enough 3107 subl(cnt1, tmp); 3108 cmpl(cnt1, cnt2); 3109 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3110 // Left less then substring. 3111 3112 bind(RET_NOT_FOUND); 3113 movl(result, -1); 3114 jmp(CLEANUP); 3115 3116 bind(FOUND_SUBSTR); 3117 // Compute start addr of substr 3118 lea(result, Address(result, tmp, scale1)); 3119 if (int_cnt2 > 0) { // Constant substring 3120 // Repeat search for small substring (< 8 chars) 3121 // from new point without reloading substring. 3122 // Have to check that we don't read beyond string. 3123 cmpl(tmp, stride-int_cnt2); 3124 jccb(Assembler::greater, ADJUST_STR); 3125 // Fall through if matched whole substring. 3126 } else { // non constant 3127 assert(int_cnt2 == -1, "should be != 0"); 3128 3129 addl(tmp, cnt2); 3130 // Found result if we matched whole substring. 3131 cmpl(tmp, stride); 3132 jcc(Assembler::lessEqual, RET_FOUND); 3133 3134 // Repeat search for small substring (<= 8 chars) 3135 // from new point 'str1' without reloading substring. 3136 cmpl(cnt2, stride); 3137 // Have to check that we don't read beyond string. 3138 jccb(Assembler::lessEqual, ADJUST_STR); 3139 3140 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3141 // Compare the rest of substring (> 8 chars). 3142 movptr(str1, result); 3143 3144 cmpl(tmp, cnt2); 3145 // First 8 chars are already matched. 3146 jccb(Assembler::equal, CHECK_NEXT); 3147 3148 bind(SCAN_SUBSTR); 3149 pcmpestri(vec, Address(str1, 0), mode); 3150 // Need to reload strings pointers if not matched whole vector 3151 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3152 3153 bind(CHECK_NEXT); 3154 subl(cnt2, stride); 3155 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3156 addptr(str1, 16); 3157 if (ae == StrIntrinsicNode::UL) { 3158 addptr(str2, 8); 3159 } else { 3160 addptr(str2, 16); 3161 } 3162 subl(cnt1, stride); 3163 cmpl(cnt2, stride); // Do not read beyond substring 3164 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3165 // Back-up strings to avoid reading beyond substring. 
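// Fewer than 'stride' substring elements remain, so both pointers are moved
// back so that the last vector compare ends exactly at the substring tail
// (8 bytes back for the Latin1 substring in the UL case, 16 bytes
// otherwise), and the counters are adjusted to match: cnt1 = cnt1 - cnt2 +
// stride, cnt2 = stride.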
3166 3167 if (ae == StrIntrinsicNode::UL) { 3168 lea(str2, Address(str2, cnt2, scale2, -8)); 3169 lea(str1, Address(str1, cnt2, scale1, -16)); 3170 } else { 3171 lea(str2, Address(str2, cnt2, scale2, -16)); 3172 lea(str1, Address(str1, cnt2, scale1, -16)); 3173 } 3174 subl(cnt1, cnt2); 3175 movl(cnt2, stride); 3176 addl(cnt1, stride); 3177 bind(CONT_SCAN_SUBSTR); 3178 if (ae == StrIntrinsicNode::UL) { 3179 pmovzxbw(vec, Address(str2, 0)); 3180 } else { 3181 movdqu(vec, Address(str2, 0)); 3182 } 3183 jmp(SCAN_SUBSTR); 3184 3185 bind(RET_FOUND_LONG); 3186 movptr(str1, Address(rsp, wordSize)); 3187 } // non constant 3188 3189 bind(RET_FOUND); 3190 // Compute substr offset 3191 subptr(result, str1); 3192 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3193 shrl(result, 1); // index 3194 } 3195 bind(CLEANUP); 3196 pop(rsp); // restore SP 3197 3198 } // string_indexof 3199 3200 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3201 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3202 ShortBranchVerifier sbv(this); 3203 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3204 3205 int stride = 8; 3206 3207 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3208 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3209 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3210 FOUND_SEQ_CHAR, DONE_LABEL; 3211 3212 movptr(result, str1); 3213 if (UseAVX >= 2) { 3214 cmpl(cnt1, stride); 3215 jcc(Assembler::less, SCAN_TO_CHAR); 3216 cmpl(cnt1, 2*stride); 3217 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3218 movdl(vec1, ch); 3219 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3220 vpxor(vec2, vec2); 3221 movl(tmp, cnt1); 3222 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3223 andl(cnt1,0x0000000F); //tail count (in chars) 3224 3225 bind(SCAN_TO_16_CHAR_LOOP); 3226 vmovdqu(vec3, Address(result, 0)); 3227 vpcmpeqw(vec3, vec3, vec1, 1); 3228 vptest(vec2, vec3); 3229 jcc(Assembler::carryClear, FOUND_CHAR); 3230 addptr(result, 32); 3231 subl(tmp, 2*stride); 3232 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3233 jmp(SCAN_TO_8_CHAR); 3234 bind(SCAN_TO_8_CHAR_INIT); 3235 movdl(vec1, ch); 3236 pshuflw(vec1, vec1, 0x00); 3237 pshufd(vec1, vec1, 0); 3238 pxor(vec2, vec2); 3239 } 3240 bind(SCAN_TO_8_CHAR); 3241 cmpl(cnt1, stride); 3242 jcc(Assembler::less, SCAN_TO_CHAR); 3243 if (UseAVX < 2) { 3244 movdl(vec1, ch); 3245 pshuflw(vec1, vec1, 0x00); 3246 pshufd(vec1, vec1, 0); 3247 pxor(vec2, vec2); 3248 } 3249 movl(tmp, cnt1); 3250 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3251 andl(cnt1,0x00000007); //tail count (in chars) 3252 3253 bind(SCAN_TO_8_CHAR_LOOP); 3254 movdqu(vec3, Address(result, 0)); 3255 pcmpeqw(vec3, vec1); 3256 ptest(vec2, vec3); 3257 jcc(Assembler::carryClear, FOUND_CHAR); 3258 addptr(result, 16); 3259 subl(tmp, stride); 3260 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3261 bind(SCAN_TO_CHAR); 3262 testl(cnt1, cnt1); 3263 jcc(Assembler::zero, RET_NOT_FOUND); 3264 bind(SCAN_TO_CHAR_LOOP); 3265 load_unsigned_short(tmp, Address(result, 0)); 3266 cmpl(ch, tmp); 3267 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3268 addptr(result, 2); 3269 subl(cnt1, 1); 3270 jccb(Assembler::zero, RET_NOT_FOUND); 3271 jmp(SCAN_TO_CHAR_LOOP); 3272 3273 bind(RET_NOT_FOUND); 3274 movl(result, -1); 3275 jmpb(DONE_LABEL); 3276 3277 bind(FOUND_CHAR); 3278 if (UseAVX >= 2) { 3279 vpmovmskb(tmp, vec3); 3280 } else { 3281 pmovmskb(tmp, vec3); 3282 } 3283 bsfl(ch, tmp); 3284 addptr(result, ch); 3285 3286 bind(FOUND_SEQ_CHAR); 3287 
subptr(result, str1); 3288 shrl(result, 1); 3289 3290 bind(DONE_LABEL); 3291 } // string_indexof_char 3292 3293 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3294 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3295 ShortBranchVerifier sbv(this); 3296 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3297 3298 int stride = 16; 3299 3300 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3301 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3302 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3303 FOUND_SEQ_CHAR, DONE_LABEL; 3304 3305 movptr(result, str1); 3306 if (UseAVX >= 2) { 3307 cmpl(cnt1, stride); 3308 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3309 cmpl(cnt1, stride*2); 3310 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3311 movdl(vec1, ch); 3312 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3313 vpxor(vec2, vec2); 3314 movl(tmp, cnt1); 3315 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3316 andl(cnt1,0x0000001F); //tail count (in chars) 3317 3318 bind(SCAN_TO_32_CHAR_LOOP); 3319 vmovdqu(vec3, Address(result, 0)); 3320 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3321 vptest(vec2, vec3); 3322 jcc(Assembler::carryClear, FOUND_CHAR); 3323 addptr(result, 32); 3324 subl(tmp, stride*2); 3325 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3326 jmp(SCAN_TO_16_CHAR); 3327 3328 bind(SCAN_TO_16_CHAR_INIT); 3329 movdl(vec1, ch); 3330 pxor(vec2, vec2); 3331 pshufb(vec1, vec2); 3332 } 3333 3334 bind(SCAN_TO_16_CHAR); 3335 cmpl(cnt1, stride); 3336 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3337 if (UseAVX < 2) { 3338 movdl(vec1, ch); 3339 pxor(vec2, vec2); 3340 pshufb(vec1, vec2); 3341 } 3342 movl(tmp, cnt1); 3343 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3344 andl(cnt1,0x0000000F); //tail count (in bytes) 3345 3346 bind(SCAN_TO_16_CHAR_LOOP); 3347 movdqu(vec3, Address(result, 0)); 3348 pcmpeqb(vec3, vec1); 3349 ptest(vec2, vec3); 3350 jcc(Assembler::carryClear, FOUND_CHAR); 3351 addptr(result, 16); 3352 subl(tmp, stride); 3353 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
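// For orientation, the vector loops above and the scalar tail below together
// implement, roughly, the following search over a Latin-1 byte[] (a hedged sketch,
// not a quote of the JDK Java source):
//   for (int i = 0; i < cnt1; i++) {
//     if ((str1[i] & 0xff) == ch) return i;
//   }
//   return -1;
// 'result' walks the array as an address and is converted back into an index by the
// final subptr(result, str1) before DONE_LABEL.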
3354 3355 bind(SCAN_TO_CHAR_INIT); 3356 testl(cnt1, cnt1); 3357 jcc(Assembler::zero, RET_NOT_FOUND); 3358 bind(SCAN_TO_CHAR_LOOP); 3359 load_unsigned_byte(tmp, Address(result, 0)); 3360 cmpl(ch, tmp); 3361 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3362 addptr(result, 1); 3363 subl(cnt1, 1); 3364 jccb(Assembler::zero, RET_NOT_FOUND); 3365 jmp(SCAN_TO_CHAR_LOOP); 3366 3367 bind(RET_NOT_FOUND); 3368 movl(result, -1); 3369 jmpb(DONE_LABEL); 3370 3371 bind(FOUND_CHAR); 3372 if (UseAVX >= 2) { 3373 vpmovmskb(tmp, vec3); 3374 } else { 3375 pmovmskb(tmp, vec3); 3376 } 3377 bsfl(ch, tmp); 3378 addptr(result, ch); 3379 3380 bind(FOUND_SEQ_CHAR); 3381 subptr(result, str1); 3382 3383 bind(DONE_LABEL); 3384 } // stringL_indexof_char 3385 3386 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3387 switch (eltype) { 3388 case T_BOOLEAN: return sizeof(jboolean); 3389 case T_BYTE: return sizeof(jbyte); 3390 case T_SHORT: return sizeof(jshort); 3391 case T_CHAR: return sizeof(jchar); 3392 case T_INT: return sizeof(jint); 3393 default: 3394 ShouldNotReachHere(); 3395 return -1; 3396 } 3397 } 3398 3399 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3400 switch (eltype) { 3401 // T_BOOLEAN used as surrogate for unsigned byte 3402 case T_BOOLEAN: movzbl(dst, src); break; 3403 case T_BYTE: movsbl(dst, src); break; 3404 case T_SHORT: movswl(dst, src); break; 3405 case T_CHAR: movzwl(dst, src); break; 3406 case T_INT: movl(dst, src); break; 3407 default: 3408 ShouldNotReachHere(); 3409 } 3410 } 3411 3412 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3413 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3414 } 3415 3416 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3417 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3418 } 3419 3420 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3421 const int vlen = Assembler::AVX_256bit; 3422 switch (eltype) { 3423 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3424 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3425 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3426 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3427 case T_INT: 3428 // do nothing 3429 break; 3430 default: 3431 ShouldNotReachHere(); 3432 } 3433 } 3434 3435 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3436 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3437 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3438 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3439 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3440 BasicType eltype) { 3441 ShortBranchVerifier sbv(this); 3442 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3443 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3444 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3445 3446 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3447 SHORT_UNROLLED_LOOP_EXIT, 3448 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3449 UNROLLED_VECTOR_LOOP_BEGIN, 3450 END; 3451 switch (eltype) { 3452 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3453 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3454 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3455 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3456 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3457 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3458 } 3459 3460 // For "renaming" for readibility of the code 3461 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3462 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3463 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3464 3465 const int elsize = arrays_hashcode_elsize(eltype); 3466 3467 /* 3468 if (cnt1 >= 2) { 3469 if (cnt1 >= 32) { 3470 UNROLLED VECTOR LOOP 3471 } 3472 UNROLLED SCALAR LOOP 3473 } 3474 SINGLE SCALAR 3475 */ 3476 3477 cmpl(cnt1, 32); 3478 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3479 3480 // cnt1 >= 32 && generate_vectorized_loop 3481 xorl(index, index); 3482 3483 // vresult = IntVector.zero(I256); 3484 for (int idx = 0; idx < 4; idx++) { 3485 vpxor(vresult[idx], vresult[idx]); 3486 } 3487 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3488 Register bound = tmp2; 3489 Register next = tmp3; 3490 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3491 movl(next, Address(tmp2, 0)); 3492 movdl(vnext, next); 3493 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3494 3495 // index = 0; 3496 // bound = cnt1 & ~(32 - 1); 3497 movl(bound, cnt1); 3498 andl(bound, ~(32 - 1)); 3499 // for (; index < bound; index += 32) { 3500 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3501 // result *= next; 3502 imull(result, next); 3503 // loop fission to upfront the cost of fetching from memory, OOO execution 3504 // can then hopefully do a better job of prefetching 3505 for (int idx = 0; idx < 4; idx++) { 3506 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3507 } 3508 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3509 for (int idx = 0; idx < 4; idx++) { 3510 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3511 arrays_hashcode_elvcast(vtmp[idx], eltype); 3512 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3513 } 3514 // index += 32; 3515 addl(index, 32); 3516 // index < bound; 3517 cmpl(index, bound); 3518 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3519 // } 3520 3521 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3522 subl(cnt1, bound); 3523 // release bound 3524 3525 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3526 for (int idx = 0; idx < 4; idx++) { 3527 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3528 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3529 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3530 } 3531 // result += vresult.reduceLanes(ADD); 3532 for (int idx = 0; idx < 4; idx++) { 3533 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3534 } 3535 3536 // } else if (cnt1 < 32) { 3537 3538 bind(SHORT_UNROLLED_BEGIN); 3539 // int i = 1; 3540 movl(index, 1); 3541 cmpl(index, cnt1); 3542 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3543 3544 // for (; i < cnt1 ; i += 2) { 3545 bind(SHORT_UNROLLED_LOOP_BEGIN); 3546 movl(tmp3, 961); 3547 imull(result, tmp3); 3548 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3549 movl(tmp3, tmp2); 3550 shll(tmp3, 5); 3551 subl(tmp3, tmp2); 3552 addl(result, tmp3); 3553 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3554 addl(result, tmp3); 3555 addl(index, 2); 3556 cmpl(index, cnt1); 3557 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3558 3559 // } 3560 // if (i >= cnt1) { 3561 bind(SHORT_UNROLLED_LOOP_EXIT); 3562 jccb(Assembler::greater, END); 3563 movl(tmp2, result); 3564 shll(result, 5); 3565 subl(result, tmp2); 3566 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3567 addl(result, tmp3); 3568 // } 3569 bind(END); 3570 3571 BLOCK_COMMENT("} // arrays_hashcode"); 3572 3573 } // arrays_hashcode 3574 3575 // helper function for string_compare 3576 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3577 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3578 Address::ScaleFactor scale2, Register index, int ae) { 3579 if (ae == StrIntrinsicNode::LL) { 3580 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3581 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3582 } else if (ae == StrIntrinsicNode::UU) { 3583 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3584 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3585 } else { 3586 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3587 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3588 } 3589 } 3590 3591 // Compare strings, used for char[] and byte[]. 3592 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3593 Register cnt1, Register cnt2, Register result, 3594 XMMRegister vec1, int ae, KRegister mask) { 3595 ShortBranchVerifier sbv(this); 3596 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3597 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only AVX3 3598 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3599 int stride2x2 = 0x40; 3600 Address::ScaleFactor scale = Address::no_scale; 3601 Address::ScaleFactor scale1 = Address::no_scale; 3602 Address::ScaleFactor scale2 = Address::no_scale; 3603 3604 if (ae != StrIntrinsicNode::LL) { 3605 stride2x2 = 0x20; 3606 } 3607 3608 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3609 shrl(cnt2, 1); 3610 } 3611 // Compute the minimum of the string lengths and the 3612 // difference of the string lengths (stack). 3613 // Do the conditional move stuff 3614 movl(result, cnt1); 3615 subl(cnt1, cnt2); 3616 push(cnt1); 3617 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3618 3619 // Is the minimum length zero? 
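// For reference, the scalar logic this intrinsic mirrors is the usual compareTo
// shape (hedged sketch, not a quote of the Java code):
//   int min = Math.min(len1, len2);
//   for (int k = 0; k < min; k++) {
//     if (s1[k] != s2[k]) return s1[k] - s2[k];
//   }
//   return len1 - len2;
// The length difference was pushed just above and is either popped into 'result' at
// LENGTH_DIFF_LABEL or discarded at POP_LABEL. The check that follows handles
// min == 0, where no element needs to be compared at all.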
3620 testl(cnt2, cnt2); 3621 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3622 if (ae == StrIntrinsicNode::LL) { 3623 // Load first bytes 3624 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3625 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3626 } else if (ae == StrIntrinsicNode::UU) { 3627 // Load first characters 3628 load_unsigned_short(result, Address(str1, 0)); 3629 load_unsigned_short(cnt1, Address(str2, 0)); 3630 } else { 3631 load_unsigned_byte(result, Address(str1, 0)); 3632 load_unsigned_short(cnt1, Address(str2, 0)); 3633 } 3634 subl(result, cnt1); 3635 jcc(Assembler::notZero, POP_LABEL); 3636 3637 if (ae == StrIntrinsicNode::UU) { 3638 // Divide length by 2 to get number of chars 3639 shrl(cnt2, 1); 3640 } 3641 cmpl(cnt2, 1); 3642 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3643 3644 // Check if the strings start at the same location and setup scale and stride 3645 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3646 cmpptr(str1, str2); 3647 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3648 if (ae == StrIntrinsicNode::LL) { 3649 scale = Address::times_1; 3650 stride = 16; 3651 } else { 3652 scale = Address::times_2; 3653 stride = 8; 3654 } 3655 } else { 3656 scale1 = Address::times_1; 3657 scale2 = Address::times_2; 3658 // scale not used 3659 stride = 8; 3660 } 3661 3662 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3663 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3664 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3665 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3666 Label COMPARE_TAIL_LONG; 3667 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only AVX3 3668 3669 int pcmpmask = 0x19; 3670 if (ae == StrIntrinsicNode::LL) { 3671 pcmpmask &= ~0x01; 3672 } 3673 3674 // Setup to compare 16-chars (32-bytes) vectors, 3675 // start from first character again because it has aligned address. 3676 if (ae == StrIntrinsicNode::LL) { 3677 stride2 = 32; 3678 } else { 3679 stride2 = 16; 3680 } 3681 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3682 adr_stride = stride << scale; 3683 } else { 3684 adr_stride1 = 8; //stride << scale1; 3685 adr_stride2 = 16; //stride << scale2; 3686 } 3687 3688 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3689 // rax and rdx are used by pcmpestri as elements counters 3690 movl(result, cnt2); 3691 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3692 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3693 3694 // fast path : compare first 2 8-char vectors. 
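// A note on the pcmpestri mode used below (a summary believed correct, not a
// normative description): pcmpmask selects "equal each" aggregation with negative
// polarity, so each element position that differs yields a set bit; CF is set when
// any such bit exists and rcx (cnt1) receives the index of the first mismatch,
// which is why the 'below' (CF == 1) branch goes to COMPARE_INDEX_CHAR.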
3695 bind(COMPARE_16_CHARS); 3696 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3697 movdqu(vec1, Address(str1, 0)); 3698 } else { 3699 pmovzxbw(vec1, Address(str1, 0)); 3700 } 3701 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3702 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3703 3704 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3705 movdqu(vec1, Address(str1, adr_stride)); 3706 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3707 } else { 3708 pmovzxbw(vec1, Address(str1, adr_stride1)); 3709 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3710 } 3711 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3712 addl(cnt1, stride); 3713 3714 // Compare the characters at index in cnt1 3715 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3716 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3717 subl(result, cnt2); 3718 jmp(POP_LABEL); 3719 3720 // Setup the registers to start vector comparison loop 3721 bind(COMPARE_WIDE_VECTORS); 3722 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3723 lea(str1, Address(str1, result, scale)); 3724 lea(str2, Address(str2, result, scale)); 3725 } else { 3726 lea(str1, Address(str1, result, scale1)); 3727 lea(str2, Address(str2, result, scale2)); 3728 } 3729 subl(result, stride2); 3730 subl(cnt2, stride2); 3731 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3732 negptr(result); 3733 3734 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3735 bind(COMPARE_WIDE_VECTORS_LOOP); 3736 3737 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3738 cmpl(cnt2, stride2x2); 3739 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3740 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3741 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3742 3743 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3744 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3745 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3746 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3747 } else { 3748 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3749 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3750 } 3751 kortestql(mask, mask); 3752 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3753 addptr(result, stride2x2); // update since we already compared at this addr 3754 subl(cnt2, stride2x2); // and sub the size too 3755 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3756 3757 vpxor(vec1, vec1); 3758 jmpb(COMPARE_WIDE_TAIL); 3759 }//if (VM_Version::supports_avx512vlbw()) 3760 3761 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3762 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3763 vmovdqu(vec1, Address(str1, result, scale)); 3764 vpxor(vec1, Address(str2, result, scale)); 3765 } else { 3766 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3767 vpxor(vec1, Address(str2, result, scale2)); 3768 } 3769 vptest(vec1, vec1); 3770 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3771 addptr(result, stride2); 3772 subl(cnt2, stride2); 3773 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3774 // clean upper bits of YMM registers 3775 vpxor(vec1, vec1); 3776 3777 // compare 
wide vectors tail 3778 bind(COMPARE_WIDE_TAIL); 3779 testptr(result, result); 3780 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3781 3782 movl(result, stride2); 3783 movl(cnt2, result); 3784 negptr(result); 3785 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3786 3787 // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors. 3788 bind(VECTOR_NOT_EQUAL); 3789 // clean upper bits of YMM registers 3790 vpxor(vec1, vec1); 3791 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3792 lea(str1, Address(str1, result, scale)); 3793 lea(str2, Address(str2, result, scale)); 3794 } else { 3795 lea(str1, Address(str1, result, scale1)); 3796 lea(str2, Address(str2, result, scale2)); 3797 } 3798 jmp(COMPARE_16_CHARS); 3799 3800 // Compare tail chars, length from 1 to 15 chars 3801 bind(COMPARE_TAIL_LONG); 3802 movl(cnt2, result); 3803 cmpl(cnt2, stride); 3804 jcc(Assembler::less, COMPARE_SMALL_STR); 3805 3806 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3807 movdqu(vec1, Address(str1, 0)); 3808 } else { 3809 pmovzxbw(vec1, Address(str1, 0)); 3810 } 3811 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3812 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3813 subptr(cnt2, stride); 3814 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3815 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3816 lea(str1, Address(str1, result, scale)); 3817 lea(str2, Address(str2, result, scale)); 3818 } else { 3819 lea(str1, Address(str1, result, scale1)); 3820 lea(str2, Address(str2, result, scale2)); 3821 } 3822 negptr(cnt2); 3823 jmpb(WHILE_HEAD_LABEL); 3824 3825 bind(COMPARE_SMALL_STR); 3826 } else if (UseSSE42Intrinsics) { 3827 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3828 int pcmpmask = 0x19; 3829 // Setup to compare 8-char (16-byte) vectors, 3830 // start from first character again because it has an aligned address.
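// In other words, cnt2 is rounded down just below to a whole number of 16-byte
// windows for the pcmpestri loop while the original minimum length stays in
// 'result'; if not even one complete window fits, control falls to COMPARE_TAIL
// and the scalar loop at WHILE_HEAD_LABEL finishes the comparison.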
3831 movl(result, cnt2); 3832 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3833 if (ae == StrIntrinsicNode::LL) { 3834 pcmpmask &= ~0x01; 3835 } 3836 jcc(Assembler::zero, COMPARE_TAIL); 3837 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3838 lea(str1, Address(str1, result, scale)); 3839 lea(str2, Address(str2, result, scale)); 3840 } else { 3841 lea(str1, Address(str1, result, scale1)); 3842 lea(str2, Address(str2, result, scale2)); 3843 } 3844 negptr(result); 3845 3846 // pcmpestri 3847 // inputs: 3848 // vec1- substring 3849 // rax - negative string length (elements count) 3850 // mem - scanned string 3851 // rdx - string length (elements count) 3852 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3853 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3854 // outputs: 3855 // rcx - first mismatched element index 3856 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3857 3858 bind(COMPARE_WIDE_VECTORS); 3859 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3860 movdqu(vec1, Address(str1, result, scale)); 3861 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3862 } else { 3863 pmovzxbw(vec1, Address(str1, result, scale1)); 3864 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3865 } 3866 // After pcmpestri cnt1(rcx) contains mismatched element index 3867 3868 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3869 addptr(result, stride); 3870 subptr(cnt2, stride); 3871 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3872 3873 // compare wide vectors tail 3874 testptr(result, result); 3875 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3876 3877 movl(cnt2, stride); 3878 movl(result, stride); 3879 negptr(result); 3880 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3881 movdqu(vec1, Address(str1, result, scale)); 3882 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3883 } else { 3884 pmovzxbw(vec1, Address(str1, result, scale1)); 3885 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3886 } 3887 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3888 3889 // Mismatched characters in the vectors 3890 bind(VECTOR_NOT_EQUAL); 3891 addptr(cnt1, result); 3892 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3893 subl(result, cnt2); 3894 jmpb(POP_LABEL); 3895 3896 bind(COMPARE_TAIL); // limit is zero 3897 movl(cnt2, result); 3898 // Fallthru to tail compare 3899 } 3900 // Shift str2 and str1 to the end of the arrays, negate min 3901 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3902 lea(str1, Address(str1, cnt2, scale)); 3903 lea(str2, Address(str2, cnt2, scale)); 3904 } else { 3905 lea(str1, Address(str1, cnt2, scale1)); 3906 lea(str2, Address(str2, cnt2, scale2)); 3907 } 3908 decrementl(cnt2); // first character was compared already 3909 negptr(cnt2); 3910 3911 // Compare the rest of the elements 3912 bind(WHILE_HEAD_LABEL); 3913 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3914 subl(result, cnt1); 3915 jccb(Assembler::notZero, POP_LABEL); 3916 increment(cnt2); 3917 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3918 3919 // Strings are equal up to min length. Return the length difference. 
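// The value popped below is the cnt1 - cnt2 difference pushed in the prologue; for
// UU both counts were still byte counts at that point, hence the shift right by one
// just below to turn the difference into a char count. The mixed-encoding compare is
// always performed with the byte-sized string as str1, which is presumably why the
// UL flavour negates the result at DONE_LABEL.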
3920 bind(LENGTH_DIFF_LABEL); 3921 pop(result); 3922 if (ae == StrIntrinsicNode::UU) { 3923 // Divide diff by 2 to get number of chars 3924 sarl(result, 1); 3925 } 3926 jmpb(DONE_LABEL); 3927 3928 if (VM_Version::supports_avx512vlbw()) { 3929 3930 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3931 3932 kmovql(cnt1, mask); 3933 notq(cnt1); 3934 bsfq(cnt2, cnt1); 3935 if (ae != StrIntrinsicNode::LL) { 3936 // Divide diff by 2 to get number of chars 3937 sarl(cnt2, 1); 3938 } 3939 addq(result, cnt2); 3940 if (ae == StrIntrinsicNode::LL) { 3941 load_unsigned_byte(cnt1, Address(str2, result)); 3942 load_unsigned_byte(result, Address(str1, result)); 3943 } else if (ae == StrIntrinsicNode::UU) { 3944 load_unsigned_short(cnt1, Address(str2, result, scale)); 3945 load_unsigned_short(result, Address(str1, result, scale)); 3946 } else { 3947 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3948 load_unsigned_byte(result, Address(str1, result, scale1)); 3949 } 3950 subl(result, cnt1); 3951 jmpb(POP_LABEL); 3952 }//if (VM_Version::supports_avx512vlbw()) 3953 3954 // Discard the stored length difference 3955 bind(POP_LABEL); 3956 pop(cnt1); 3957 3958 // That's it 3959 bind(DONE_LABEL); 3960 if(ae == StrIntrinsicNode::UL) { 3961 negl(result); 3962 } 3963 3964 } 3965 3966 // Search for Non-ASCII character (Negative byte value) in a byte array, 3967 // return the index of the first such character, otherwise the length 3968 // of the array segment searched. 3969 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3970 // @IntrinsicCandidate 3971 // public static int countPositives(byte[] ba, int off, int len) { 3972 // for (int i = off; i < off + len; i++) { 3973 // if (ba[i] < 0) { 3974 // return i - off; 3975 // } 3976 // } 3977 // return len; 3978 // } 3979 void C2_MacroAssembler::count_positives(Register ary1, Register len, 3980 Register result, Register tmp1, 3981 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3982 // rsi: byte array 3983 // rcx: len 3984 // rax: result 3985 ShortBranchVerifier sbv(this); 3986 assert_different_registers(ary1, len, result, tmp1); 3987 assert_different_registers(vec1, vec2); 3988 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3989 3990 movl(result, len); // copy 3991 // len == 0 3992 testl(len, len); 3993 jcc(Assembler::zero, DONE); 3994 3995 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3996 VM_Version::supports_avx512vlbw() && 3997 VM_Version::supports_bmi2()) { 3998 3999 Label test_64_loop, test_tail, BREAK_LOOP; 4000 movl(tmp1, len); 4001 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4002 4003 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4004 andl(len, 0xffffffc0); // vector count (in chars) 4005 jccb(Assembler::zero, test_tail); 4006 4007 lea(ary1, Address(ary1, len, Address::times_1)); 4008 negptr(len); 4009 4010 bind(test_64_loop); 4011 // Check whether our 64 elements of size byte contain negatives 4012 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4013 kortestql(mask1, mask1); 4014 jcc(Assembler::notZero, BREAK_LOOP); 4015 4016 addptr(len, 64); 4017 jccb(Assembler::notZero, test_64_loop); 4018 4019 bind(test_tail); 4020 // bail out when there is nothing to be done 4021 testl(tmp1, -1); 4022 jcc(Assembler::zero, DONE); 4023 4024 4025 // check the tail for absense of negatives 4026 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4027 { 4028 Register tmp3_aliased = len; 4029 mov64(tmp3_aliased, 
0xFFFFFFFFFFFFFFFF); 4030 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4031 notq(tmp3_aliased); 4032 kmovql(mask2, tmp3_aliased); 4033 } 4034 4035 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4036 ktestq(mask1, mask2); 4037 jcc(Assembler::zero, DONE); 4038 4039 // do a full check for negative registers in the tail 4040 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4041 // ary1 already pointing to the right place 4042 jmpb(TAIL_START); 4043 4044 bind(BREAK_LOOP); 4045 // At least one byte in the last 64 byte block was negative. 4046 // Set up to look at the last 64 bytes as if they were a tail 4047 lea(ary1, Address(ary1, len, Address::times_1)); 4048 addptr(result, len); 4049 // Ignore the very last byte: if all others are positive, 4050 // it must be negative, so we can skip right to the 2+1 byte 4051 // end comparison at this point 4052 orl(result, 63); 4053 movl(len, 63); 4054 // Fallthru to tail compare 4055 } else { 4056 4057 if (UseAVX >= 2) { 4058 // With AVX2, use 32-byte vector compare 4059 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4060 4061 // Compare 32-byte vectors 4062 testl(len, 0xffffffe0); // vector count (in bytes) 4063 jccb(Assembler::zero, TAIL_START); 4064 4065 andl(len, 0xffffffe0); 4066 lea(ary1, Address(ary1, len, Address::times_1)); 4067 negptr(len); 4068 4069 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4070 movdl(vec2, tmp1); 4071 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4072 4073 bind(COMPARE_WIDE_VECTORS); 4074 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4075 vptest(vec1, vec2); 4076 jccb(Assembler::notZero, BREAK_LOOP); 4077 addptr(len, 32); 4078 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4079 4080 testl(result, 0x0000001f); // any bytes remaining? 4081 jcc(Assembler::zero, DONE); 4082 4083 // Quick test using the already prepared vector mask 4084 movl(len, result); 4085 andl(len, 0x0000001f); 4086 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4087 vptest(vec1, vec2); 4088 jcc(Assembler::zero, DONE); 4089 // There are zeros, jump to the tail to determine exactly where 4090 jmpb(TAIL_START); 4091 4092 bind(BREAK_LOOP); 4093 // At least one byte in the last 32-byte vector is negative. 4094 // Set up to look at the last 32 bytes as if they were a tail 4095 lea(ary1, Address(ary1, len, Address::times_1)); 4096 addptr(result, len); 4097 // Ignore the very last byte: if all others are positive, 4098 // it must be negative, so we can skip right to the 2+1 byte 4099 // end comparison at this point 4100 orl(result, 31); 4101 movl(len, 31); 4102 // Fallthru to tail compare 4103 } else if (UseSSE42Intrinsics) { 4104 // With SSE4.2, use double quad vector compare 4105 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4106 4107 // Compare 16-byte vectors 4108 testl(len, 0xfffffff0); // vector count (in bytes) 4109 jcc(Assembler::zero, TAIL_START); 4110 4111 andl(len, 0xfffffff0); 4112 lea(ary1, Address(ary1, len, Address::times_1)); 4113 negptr(len); 4114 4115 movl(tmp1, 0x80808080); 4116 movdl(vec2, tmp1); 4117 pshufd(vec2, vec2, 0); 4118 4119 bind(COMPARE_WIDE_VECTORS); 4120 movdqu(vec1, Address(ary1, len, Address::times_1)); 4121 ptest(vec1, vec2); 4122 jccb(Assembler::notZero, BREAK_LOOP); 4123 addptr(len, 16); 4124 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4125 4126 testl(result, 0x0000000f); // len is zero, any bytes remaining? 
4127 jcc(Assembler::zero, DONE); 4128 4129 // Quick test using the already prepared vector mask 4130 movl(len, result); 4131 andl(len, 0x0000000f); // tail count (in bytes) 4132 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4133 ptest(vec1, vec2); 4134 jcc(Assembler::zero, DONE); 4135 jmpb(TAIL_START); 4136 4137 bind(BREAK_LOOP); 4138 // At least one byte in the last 16-byte vector is negative. 4139 // Set up and look at the last 16 bytes as if they were a tail 4140 lea(ary1, Address(ary1, len, Address::times_1)); 4141 addptr(result, len); 4142 // Ignore the very last byte: if all others are positive, 4143 // it must be negative, so we can skip right to the 2+1 byte 4144 // end comparison at this point 4145 orl(result, 15); 4146 movl(len, 15); 4147 // Fallthru to tail compare 4148 } 4149 } 4150 4151 bind(TAIL_START); 4152 // Compare 4-byte vectors 4153 andl(len, 0xfffffffc); // vector count (in bytes) 4154 jccb(Assembler::zero, COMPARE_CHAR); 4155 4156 lea(ary1, Address(ary1, len, Address::times_1)); 4157 negptr(len); 4158 4159 bind(COMPARE_VECTORS); 4160 movl(tmp1, Address(ary1, len, Address::times_1)); 4161 andl(tmp1, 0x80808080); 4162 jccb(Assembler::notZero, TAIL_ADJUST); 4163 addptr(len, 4); 4164 jccb(Assembler::notZero, COMPARE_VECTORS); 4165 4166 // Compare trailing char (final 2-3 bytes), if any 4167 bind(COMPARE_CHAR); 4168 4169 testl(result, 0x2); // tail char 4170 jccb(Assembler::zero, COMPARE_BYTE); 4171 load_unsigned_short(tmp1, Address(ary1, 0)); 4172 andl(tmp1, 0x00008080); 4173 jccb(Assembler::notZero, CHAR_ADJUST); 4174 lea(ary1, Address(ary1, 2)); 4175 4176 bind(COMPARE_BYTE); 4177 testl(result, 0x1); // tail byte 4178 jccb(Assembler::zero, DONE); 4179 load_unsigned_byte(tmp1, Address(ary1, 0)); 4180 testl(tmp1, 0x00000080); 4181 jccb(Assembler::zero, DONE); 4182 subptr(result, 1); 4183 jmpb(DONE); 4184 4185 bind(TAIL_ADJUST); 4186 // there are negative bits in the last 4 byte block. 4187 // Adjust result and check the next three bytes 4188 addptr(result, len); 4189 orl(result, 3); 4190 lea(ary1, Address(ary1, len, Address::times_1)); 4191 jmpb(COMPARE_CHAR); 4192 4193 bind(CHAR_ADJUST); 4194 // We are looking at a char + optional byte tail, and found that one 4195 // of the bytes in the char is negative. Adjust the result, check the 4196 // first byte and readjust if needed. 4197 andl(result, 0xfffffffc); 4198 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4199 jccb(Assembler::notZero, DONE); 4200 addptr(result, 1); 4201 4202 // That's it 4203 bind(DONE); 4204 if (UseAVX >= 2) { 4205 // clean upper bits of YMM registers 4206 vpxor(vec1, vec1); 4207 vpxor(vec2, vec2); 4208 } 4209 } 4210 4211 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4212 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4213 Register limit, Register result, Register chr, 4214 XMMRegister vec1, XMMRegister vec2, bool is_char, 4215 KRegister mask, bool expand_ary2) { 4216 // for expand_ary2, limit is the (smaller) size of the second array. 4217 ShortBranchVerifier sbv(this); 4218 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4219 4220 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4221 "Expansion only implemented for AVX2"); 4222 4223 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4224 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4225 4226 Address::ScaleFactor scaleFactor = expand_ary2 ? 
Address::times_2 : Address::times_1; 4227 int scaleIncr = expand_ary2 ? 8 : 16; 4228 4229 if (is_array_equ) { 4230 // Check the input args 4231 cmpoop(ary1, ary2); 4232 jcc(Assembler::equal, TRUE_LABEL); 4233 4234 // Need additional checks for arrays_equals. 4235 testptr(ary1, ary1); 4236 jcc(Assembler::zero, FALSE_LABEL); 4237 testptr(ary2, ary2); 4238 jcc(Assembler::zero, FALSE_LABEL); 4239 4240 // Check the lengths 4241 movl(limit, Address(ary1, length_offset)); 4242 cmpl(limit, Address(ary2, length_offset)); 4243 jcc(Assembler::notEqual, FALSE_LABEL); 4244 } 4245 4246 // count == 0 4247 testl(limit, limit); 4248 jcc(Assembler::zero, TRUE_LABEL); 4249 4250 if (is_array_equ) { 4251 // Load array address 4252 lea(ary1, Address(ary1, base_offset)); 4253 lea(ary2, Address(ary2, base_offset)); 4254 } 4255 4256 if (is_array_equ && is_char) { 4257 // arrays_equals when used for char[]. 4258 shll(limit, 1); // byte count != 0 4259 } 4260 movl(result, limit); // copy 4261 4262 if (UseAVX >= 2) { 4263 // With AVX2, use 32-byte vector compare 4264 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4265 4266 // Compare 32-byte vectors 4267 if (expand_ary2) { 4268 andl(result, 0x0000000f); // tail count (in bytes) 4269 andl(limit, 0xfffffff0); // vector count (in bytes) 4270 jcc(Assembler::zero, COMPARE_TAIL); 4271 } else { 4272 andl(result, 0x0000001f); // tail count (in bytes) 4273 andl(limit, 0xffffffe0); // vector count (in bytes) 4274 jcc(Assembler::zero, COMPARE_TAIL_16); 4275 } 4276 4277 lea(ary1, Address(ary1, limit, scaleFactor)); 4278 lea(ary2, Address(ary2, limit, Address::times_1)); 4279 negptr(limit); 4280 4281 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4282 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4283 4284 cmpl(limit, -64); 4285 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4286 4287 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4288 4289 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4290 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4291 kortestql(mask, mask); 4292 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4293 addptr(limit, 64); // update since we already compared at this addr 4294 cmpl(limit, -64); 4295 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4296 4297 // At this point we may still need to compare -limit+result bytes. 4298 // We could execute the next two instruction and just continue via non-wide path: 4299 // cmpl(limit, 0); 4300 // jcc(Assembler::equal, COMPARE_TAIL); // true 4301 // But since we stopped at the points ary{1,2}+limit which are 4302 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4303 // (|limit| <= 32 and result < 32), 4304 // we may just compare the last 64 bytes. 
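// Put differently, as a sketch: one more full 64-byte evpcmpeqb is done over the
// window [end_of_array - 64, end_of_array) of each array; it may overlap bytes the
// loop has already proven equal, and re-comparing equal bytes cannot turn a match
// into a mismatch, so no masked tail compare is needed.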
4305 // 4306 addptr(result, -64); // it is safe, bc we just came from this area 4307 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4308 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4309 kortestql(mask, mask); 4310 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4311 4312 jmp(TRUE_LABEL); 4313 4314 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4315 4316 }//if (VM_Version::supports_avx512vlbw()) 4317 4318 bind(COMPARE_WIDE_VECTORS); 4319 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4320 if (expand_ary2) { 4321 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4322 } else { 4323 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4324 } 4325 vpxor(vec1, vec2); 4326 4327 vptest(vec1, vec1); 4328 jcc(Assembler::notZero, FALSE_LABEL); 4329 addptr(limit, scaleIncr * 2); 4330 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4331 4332 testl(result, result); 4333 jcc(Assembler::zero, TRUE_LABEL); 4334 4335 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4336 if (expand_ary2) { 4337 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4338 } else { 4339 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4340 } 4341 vpxor(vec1, vec2); 4342 4343 vptest(vec1, vec1); 4344 jcc(Assembler::notZero, FALSE_LABEL); 4345 jmp(TRUE_LABEL); 4346 4347 bind(COMPARE_TAIL_16); // limit is zero 4348 movl(limit, result); 4349 4350 // Compare 16-byte chunks 4351 andl(result, 0x0000000f); // tail count (in bytes) 4352 andl(limit, 0xfffffff0); // vector count (in bytes) 4353 jcc(Assembler::zero, COMPARE_TAIL); 4354 4355 lea(ary1, Address(ary1, limit, scaleFactor)); 4356 lea(ary2, Address(ary2, limit, Address::times_1)); 4357 negptr(limit); 4358 4359 bind(COMPARE_WIDE_VECTORS_16); 4360 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4361 if (expand_ary2) { 4362 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4363 } else { 4364 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4365 } 4366 pxor(vec1, vec2); 4367 4368 ptest(vec1, vec1); 4369 jcc(Assembler::notZero, FALSE_LABEL); 4370 addptr(limit, scaleIncr); 4371 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4372 4373 bind(COMPARE_TAIL); // limit is zero 4374 movl(limit, result); 4375 // Fallthru to tail compare 4376 } else if (UseSSE42Intrinsics) { 4377 // With SSE4.2, use double quad vector compare 4378 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4379 4380 // Compare 16-byte vectors 4381 andl(result, 0x0000000f); // tail count (in bytes) 4382 andl(limit, 0xfffffff0); // vector count (in bytes) 4383 jcc(Assembler::zero, COMPARE_TAIL); 4384 4385 lea(ary1, Address(ary1, limit, Address::times_1)); 4386 lea(ary2, Address(ary2, limit, Address::times_1)); 4387 negptr(limit); 4388 4389 bind(COMPARE_WIDE_VECTORS); 4390 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4391 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4392 pxor(vec1, vec2); 4393 4394 ptest(vec1, vec1); 4395 jcc(Assembler::notZero, FALSE_LABEL); 4396 addptr(limit, 16); 4397 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4398 4399 testl(result, result); 4400 jcc(Assembler::zero, TRUE_LABEL); 4401 4402 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4403 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4404 pxor(vec1, vec2); 4405 4406 ptest(vec1, vec1); 4407 jccb(Assembler::notZero, FALSE_LABEL); 4408 jmpb(TRUE_LABEL); 4409 4410 bind(COMPARE_TAIL); // limit is zero 4411 movl(limit, 
result); 4412 // Fallthru to tail compare 4413 } 4414 4415 // Compare 4-byte vectors 4416 if (expand_ary2) { 4417 testl(result, result); 4418 jccb(Assembler::zero, TRUE_LABEL); 4419 } else { 4420 andl(limit, 0xfffffffc); // vector count (in bytes) 4421 jccb(Assembler::zero, COMPARE_CHAR); 4422 } 4423 4424 lea(ary1, Address(ary1, limit, scaleFactor)); 4425 lea(ary2, Address(ary2, limit, Address::times_1)); 4426 negptr(limit); 4427 4428 bind(COMPARE_VECTORS); 4429 if (expand_ary2) { 4430 // There are no "vector" operations for bytes to shorts 4431 movzbl(chr, Address(ary2, limit, Address::times_1)); 4432 cmpw(Address(ary1, limit, Address::times_2), chr); 4433 jccb(Assembler::notEqual, FALSE_LABEL); 4434 addptr(limit, 1); 4435 jcc(Assembler::notZero, COMPARE_VECTORS); 4436 jmp(TRUE_LABEL); 4437 } else { 4438 movl(chr, Address(ary1, limit, Address::times_1)); 4439 cmpl(chr, Address(ary2, limit, Address::times_1)); 4440 jccb(Assembler::notEqual, FALSE_LABEL); 4441 addptr(limit, 4); 4442 jcc(Assembler::notZero, COMPARE_VECTORS); 4443 } 4444 4445 // Compare trailing char (final 2 bytes), if any 4446 bind(COMPARE_CHAR); 4447 testl(result, 0x2); // tail char 4448 jccb(Assembler::zero, COMPARE_BYTE); 4449 load_unsigned_short(chr, Address(ary1, 0)); 4450 load_unsigned_short(limit, Address(ary2, 0)); 4451 cmpl(chr, limit); 4452 jccb(Assembler::notEqual, FALSE_LABEL); 4453 4454 if (is_array_equ && is_char) { 4455 bind(COMPARE_BYTE); 4456 } else { 4457 lea(ary1, Address(ary1, 2)); 4458 lea(ary2, Address(ary2, 2)); 4459 4460 bind(COMPARE_BYTE); 4461 testl(result, 0x1); // tail byte 4462 jccb(Assembler::zero, TRUE_LABEL); 4463 load_unsigned_byte(chr, Address(ary1, 0)); 4464 load_unsigned_byte(limit, Address(ary2, 0)); 4465 cmpl(chr, limit); 4466 jccb(Assembler::notEqual, FALSE_LABEL); 4467 } 4468 bind(TRUE_LABEL); 4469 movl(result, 1); // return true 4470 jmpb(DONE); 4471 4472 bind(FALSE_LABEL); 4473 xorl(result, result); // return false 4474 4475 // That's it 4476 bind(DONE); 4477 if (UseAVX >= 2) { 4478 // clean upper bits of YMM registers 4479 vpxor(vec1, vec1); 4480 vpxor(vec2, vec2); 4481 } 4482 } 4483 4484 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4485 #define __ masm. 4486 Register dst = stub.data<0>(); 4487 XMMRegister src = stub.data<1>(); 4488 address target = stub.data<2>(); 4489 __ bind(stub.entry()); 4490 __ subptr(rsp, 8); 4491 __ movdbl(Address(rsp), src); 4492 __ call(RuntimeAddress(target)); 4493 // APX REX2 encoding for pop(dst) increases the stub size by 1 byte. 
4494 __ pop(dst); 4495 __ jmp(stub.continuation()); 4496 #undef __ 4497 } 4498 4499 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4500 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4501 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4502 4503 address slowpath_target; 4504 if (dst_bt == T_INT) { 4505 if (src_bt == T_FLOAT) { 4506 cvttss2sil(dst, src); 4507 cmpl(dst, 0x80000000); 4508 slowpath_target = StubRoutines::x86::f2i_fixup(); 4509 } else { 4510 cvttsd2sil(dst, src); 4511 cmpl(dst, 0x80000000); 4512 slowpath_target = StubRoutines::x86::d2i_fixup(); 4513 } 4514 } else { 4515 if (src_bt == T_FLOAT) { 4516 cvttss2siq(dst, src); 4517 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4518 slowpath_target = StubRoutines::x86::f2l_fixup(); 4519 } else { 4520 cvttsd2siq(dst, src); 4521 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4522 slowpath_target = StubRoutines::x86::d2l_fixup(); 4523 } 4524 } 4525 4526 // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte. 4527 int max_size = 23 + (UseAPX ? 1 : 0); 4528 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath); 4529 jcc(Assembler::equal, stub->entry()); 4530 bind(stub->continuation()); 4531 } 4532 4533 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4534 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4535 switch(ideal_opc) { 4536 case Op_LShiftVS: 4537 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4538 case Op_LShiftVI: 4539 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4540 case Op_LShiftVL: 4541 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4542 case Op_RShiftVS: 4543 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4544 case Op_RShiftVI: 4545 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4546 case Op_RShiftVL: 4547 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4548 case Op_URShiftVS: 4549 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4550 case Op_URShiftVI: 4551 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4552 case Op_URShiftVL: 4553 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4554 case Op_RotateRightV: 4555 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4556 case Op_RotateLeftV: 4557 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4558 default: 4559 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4560 break; 4561 } 4562 } 4563 4564 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4565 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) { 4566 if (is_unsigned) { 4567 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4568 } else { 4569 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4570 } 4571 } 4572 4573 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4574 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4575 switch (elem_bt) { 4576 case T_BYTE: 4577 if (ideal_opc == Op_SaturatingAddV) { 4578 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4579 } else { 4580 
assert(ideal_opc == Op_SaturatingSubV, ""); 4581 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4582 } 4583 break; 4584 case T_SHORT: 4585 if (ideal_opc == Op_SaturatingAddV) { 4586 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4587 } else { 4588 assert(ideal_opc == Op_SaturatingSubV, ""); 4589 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4590 } 4591 break; 4592 default: 4593 fatal("Unsupported type %s", type2name(elem_bt)); 4594 break; 4595 } 4596 } 4597 4598 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4599 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4600 switch (elem_bt) { 4601 case T_BYTE: 4602 if (ideal_opc == Op_SaturatingAddV) { 4603 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4604 } else { 4605 assert(ideal_opc == Op_SaturatingSubV, ""); 4606 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4607 } 4608 break; 4609 case T_SHORT: 4610 if (ideal_opc == Op_SaturatingAddV) { 4611 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4612 } else { 4613 assert(ideal_opc == Op_SaturatingSubV, ""); 4614 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4615 } 4616 break; 4617 default: 4618 fatal("Unsupported type %s", type2name(elem_bt)); 4619 break; 4620 } 4621 } 4622 4623 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4624 Address src2, bool is_unsigned, bool merge, int vlen_enc) { 4625 if (is_unsigned) { 4626 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4627 } else { 4628 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4629 } 4630 } 4631 4632 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4633 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4634 switch (elem_bt) { 4635 case T_BYTE: 4636 if (ideal_opc == Op_SaturatingAddV) { 4637 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4638 } else { 4639 assert(ideal_opc == Op_SaturatingSubV, ""); 4640 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4641 } 4642 break; 4643 case T_SHORT: 4644 if (ideal_opc == Op_SaturatingAddV) { 4645 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4646 } else { 4647 assert(ideal_opc == Op_SaturatingSubV, ""); 4648 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4649 } 4650 break; 4651 default: 4652 fatal("Unsupported type %s", type2name(elem_bt)); 4653 break; 4654 } 4655 } 4656 4657 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4658 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4659 switch (elem_bt) { 4660 case T_BYTE: 4661 if (ideal_opc == Op_SaturatingAddV) { 4662 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4663 } else { 4664 assert(ideal_opc == Op_SaturatingSubV, ""); 4665 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4666 } 4667 break; 4668 case T_SHORT: 4669 if (ideal_opc == Op_SaturatingAddV) { 4670 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4671 } else { 4672 assert(ideal_opc == Op_SaturatingSubV, ""); 4673 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4674 } 4675 break; 4676 default: 4677 fatal("Unsupported type %s", type2name(elem_bt)); 4678 break; 4679 } 4680 } 4681 4682 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4683 XMMRegister src1, XMMRegister src2, 
bool merge, int vlen_enc, 4684 bool is_varshift) { 4685 switch (ideal_opc) { 4686 case Op_AddVB: 4687 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4688 case Op_AddVS: 4689 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4690 case Op_AddVI: 4691 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4692 case Op_AddVL: 4693 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4694 case Op_AddVF: 4695 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4696 case Op_AddVD: 4697 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4698 case Op_SubVB: 4699 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4700 case Op_SubVS: 4701 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4702 case Op_SubVI: 4703 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4704 case Op_SubVL: 4705 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4706 case Op_SubVF: 4707 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4708 case Op_SubVD: 4709 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4710 case Op_MulVS: 4711 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4712 case Op_MulVI: 4713 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4714 case Op_MulVL: 4715 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4716 case Op_MulVF: 4717 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4718 case Op_MulVD: 4719 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4720 case Op_DivVF: 4721 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4722 case Op_DivVD: 4723 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4724 case Op_SqrtVF: 4725 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4726 case Op_SqrtVD: 4727 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4728 case Op_AbsVB: 4729 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4730 case Op_AbsVS: 4731 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4732 case Op_AbsVI: 4733 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4734 case Op_AbsVL: 4735 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4736 case Op_FmaVF: 4737 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4738 case Op_FmaVD: 4739 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4740 case Op_VectorRearrange: 4741 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4742 case Op_LShiftVS: 4743 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4744 case Op_LShiftVI: 4745 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4746 case Op_LShiftVL: 4747 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4748 case Op_RShiftVS: 4749 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4750 case Op_RShiftVI: 4751 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4752 case Op_RShiftVL: 4753 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4754 case Op_URShiftVS: 4755 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4756 case Op_URShiftVI: 4757 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4758 case Op_URShiftVL: 4759 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4760 case Op_RotateLeftV: 4761 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4762 case Op_RotateRightV: 4763 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4764 case Op_MaxV: 4765 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4766 case Op_MinV: 4767 evpmins(eType, dst, mask, 
src1, src2, merge, vlen_enc); break; 4768 case Op_UMinV: 4769 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4770 case Op_UMaxV: 4771 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4772 case Op_XorV: 4773 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4774 case Op_OrV: 4775 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4776 case Op_AndV: 4777 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4778 default: 4779 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4780 break; 4781 } 4782 } 4783 4784 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4785 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4786 switch (ideal_opc) { 4787 case Op_AddVB: 4788 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4789 case Op_AddVS: 4790 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4791 case Op_AddVI: 4792 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4793 case Op_AddVL: 4794 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4795 case Op_AddVF: 4796 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4797 case Op_AddVD: 4798 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4799 case Op_SubVB: 4800 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4801 case Op_SubVS: 4802 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4803 case Op_SubVI: 4804 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4805 case Op_SubVL: 4806 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4807 case Op_SubVF: 4808 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4809 case Op_SubVD: 4810 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4811 case Op_MulVS: 4812 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4813 case Op_MulVI: 4814 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4815 case Op_MulVL: 4816 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4817 case Op_MulVF: 4818 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4819 case Op_MulVD: 4820 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4821 case Op_DivVF: 4822 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4823 case Op_DivVD: 4824 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4825 case Op_FmaVF: 4826 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4827 case Op_FmaVD: 4828 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4829 case Op_MaxV: 4830 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4831 case Op_MinV: 4832 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4833 case Op_UMaxV: 4834 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4835 case Op_UMinV: 4836 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4837 case Op_XorV: 4838 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4839 case Op_OrV: 4840 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4841 case Op_AndV: 4842 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4843 default: 4844 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4845 break; 4846 } 4847 } 4848 4849 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4850 KRegister src1, KRegister src2) { 4851 BasicType etype = T_ILLEGAL; 4852 switch(mask_len) { 4853 case 2: 4854 case 4: 4855 case 8: etype = T_BYTE; break; 4856 case 16: etype = T_SHORT; break; 4857 case 32: etype = T_INT; break; 4858 case 64: etype = T_LONG; break; 
4859 default: fatal("Unsupported type"); break; 4860 } 4861 assert(etype != T_ILLEGAL, ""); 4862 switch(ideal_opc) { 4863 case Op_AndVMask: 4864 kand(etype, dst, src1, src2); break; 4865 case Op_OrVMask: 4866 kor(etype, dst, src1, src2); break; 4867 case Op_XorVMask: 4868 kxor(etype, dst, src1, src2); break; 4869 default: 4870 fatal("Unsupported masked operation"); break; 4871 } 4872 } 4873 4874 /* 4875 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4876 * If src is NaN, the result is 0. 4877 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4878 * the result is equal to the value of Integer.MIN_VALUE. 4879 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4880 * the result is equal to the value of Integer.MAX_VALUE. 4881 */ 4882 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4883 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4884 Register rscratch, AddressLiteral float_sign_flip, 4885 int vec_enc) { 4886 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4887 Label done; 4888 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4889 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4890 vptest(xtmp2, xtmp2, vec_enc); 4891 jccb(Assembler::equal, done); 4892 4893 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4894 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4895 4896 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4897 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4898 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4899 4900 // Recompute the mask for remaining special value. 4901 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4902 // Extract SRC values corresponding to TRUE mask lanes. 4903 vpand(xtmp4, xtmp2, src, vec_enc); 4904 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4905 // values are set. 
4906 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4907 4908 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4909 bind(done); 4910 } 4911 4912 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4913 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4914 Register rscratch, AddressLiteral float_sign_flip, 4915 int vec_enc) { 4916 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4917 Label done; 4918 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4919 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4920 kortestwl(ktmp1, ktmp1); 4921 jccb(Assembler::equal, done); 4922 4923 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4924 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4925 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4926 4927 kxorwl(ktmp1, ktmp1, ktmp2); 4928 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4929 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4930 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4931 bind(done); 4932 } 4933 4934 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4935 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4936 Register rscratch, AddressLiteral double_sign_flip, 4937 int vec_enc) { 4938 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4939 4940 Label done; 4941 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4942 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4943 kortestwl(ktmp1, ktmp1); 4944 jccb(Assembler::equal, done); 4945 4946 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4947 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4948 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4949 4950 kxorwl(ktmp1, ktmp1, ktmp2); 4951 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4952 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4953 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4954 bind(done); 4955 } 4956 4957 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4958 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4959 Register rscratch, AddressLiteral float_sign_flip, 4960 int vec_enc) { 4961 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4962 Label done; 4963 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4964 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4965 kortestwl(ktmp1, ktmp1); 4966 jccb(Assembler::equal, done); 4967 4968 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4969 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4970 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4971 4972 kxorwl(ktmp1, ktmp1, ktmp2); 4973 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4974 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4975 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4976 bind(done); 4977 } 4978 4979 /* 4980 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4981 * If src is NaN, the result is 0. 4982 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4983 * the result is equal to the value of Long.MIN_VALUE. 4984 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 4985 * the result is equal to the value of Long.MAX_VALUE. 
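 *
 * For illustration, the scalar Java narrowing-cast semantics this fixup mirrors (reference only, not emitted code):
 *   (long) Double.NaN               == 0L
 *   (long) Double.NEGATIVE_INFINITY == Long.MIN_VALUE
 *   (long) -1.0e300                 == Long.MIN_VALUE
 *   (long) Double.POSITIVE_INFINITY == Long.MAX_VALUE
 *   (long) 1.0e300                  == Long.MAX_VALUE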
4986 */ 4987 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4988 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4989 Register rscratch, AddressLiteral double_sign_flip, 4990 int vec_enc) { 4991 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4992 4993 Label done; 4994 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4995 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 4996 kortestwl(ktmp1, ktmp1); 4997 jccb(Assembler::equal, done); 4998 4999 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5000 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5001 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5002 5003 kxorwl(ktmp1, ktmp1, ktmp2); 5004 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5005 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5006 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5007 bind(done); 5008 } 5009 5010 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 5011 XMMRegister xtmp, int index, int vec_enc) { 5012 assert(vec_enc < Assembler::AVX_512bit, ""); 5013 if (vec_enc == Assembler::AVX_256bit) { 5014 vextractf128_high(xtmp, src); 5015 vshufps(dst, src, xtmp, index, vec_enc); 5016 } else { 5017 vshufps(dst, src, zero, index, vec_enc); 5018 } 5019 } 5020 5021 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5022 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 5023 AddressLiteral float_sign_flip, int src_vec_enc) { 5024 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5025 5026 Label done; 5027 // Compare the destination lanes with float_sign_flip 5028 // value to get mask for all special values. 5029 movdqu(xtmp1, float_sign_flip, rscratch); 5030 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 5031 ptest(xtmp2, xtmp2); 5032 jccb(Assembler::equal, done); 5033 5034 // Flip float_sign_flip to get max integer value. 5035 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 5036 pxor(xtmp1, xtmp4); 5037 5038 // Set destination lanes corresponding to unordered source lanes as zero. 5039 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 5040 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 5041 5042 // Shuffle mask vector and pack the lower double word from each quadword lane. 5043 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5044 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 5045 5046 // Recompute the mask for the remaining special values. 5047 pxor(xtmp2, xtmp3); 5048 // Extract mask corresponding to non-negative source lanes. 5049 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 5050 5051 // Shuffle mask vector and pack the lower double word from each quadword lane. 5052 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5053 pand(xtmp3, xtmp2); 5054 5055 // Replace destination lanes holding the special value (0x80000000) with max int 5056 // if corresponding source lane holds a +ve value.
5057 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5058 bind(done); 5059 } 5060 5061 5062 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5063 XMMRegister xtmp, Register rscratch, int vec_enc) { 5064 switch(to_elem_bt) { 5065 case T_SHORT: 5066 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5067 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5068 vpackusdw(dst, dst, zero, vec_enc); 5069 if (vec_enc == Assembler::AVX_256bit) { 5070 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5071 } 5072 break; 5073 case T_BYTE: 5074 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5075 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5076 vpackusdw(dst, dst, zero, vec_enc); 5077 if (vec_enc == Assembler::AVX_256bit) { 5078 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5079 } 5080 vpackuswb(dst, dst, zero, vec_enc); 5081 break; 5082 default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt)); 5083 } 5084 } 5085 5086 /* 5087 * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):- 5088 * a) Perform vector D2L/F2I cast. 5089 * b) Choose the fast path if none of the result vector lanes contains the 0x80000000 value; 5090 * a 0x80000000 result lane signifies that the source value could be any of the special floating point 5091 * values (NaN, -Inf, Inf, Max, -Min). 5092 * c) Set the destination to zero if the source is a NaN value. 5093 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
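 *
 * A scalar sketch of steps b)-d) for the F2I case (illustration only; cvttss2si_raw is a
 * hypothetical stand-in for the CVTTSS2SI instruction, which returns 0x80000000 for NaN and
 * out-of-range inputs):
 *
 *   int cast_f2i(float f) {
 *     int r = cvttss2si_raw(f);
 *     if (r != (int) 0x80000000) return r;           // b) fast path: lane holds no special value
 *     if (f != f)                return 0;            // c) NaN -> 0
 *     if (f > 0.0f)              return 0x7FFFFFFF;   // d) +ve special value -> Integer.MAX_VALUE
 *     return (int) 0x80000000;                        //    -Inf / <= MIN_VALUE keeps Integer.MIN_VALUE
 *   }
 *
 * The vector routines in this file perform the same fixup with compare masks and blends instead of branches.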
5094 */ 5095 5096 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5097 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5098 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5099 int to_elem_sz = type2aelembytes(to_elem_bt); 5100 assert(to_elem_sz <= 4, ""); 5101 vcvttps2dq(dst, src, vec_enc); 5102 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5103 if (to_elem_sz < 4) { 5104 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5105 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5106 } 5107 } 5108 5109 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5110 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5111 Register rscratch, int vec_enc) { 5112 int to_elem_sz = type2aelembytes(to_elem_bt); 5113 assert(to_elem_sz <= 4, ""); 5114 vcvttps2dq(dst, src, vec_enc); 5115 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5116 switch(to_elem_bt) { 5117 case T_INT: 5118 break; 5119 case T_SHORT: 5120 evpmovdw(dst, dst, vec_enc); 5121 break; 5122 case T_BYTE: 5123 evpmovdb(dst, dst, vec_enc); 5124 break; 5125 default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt)); 5126 } 5127 } 5128 5129 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5130 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5131 Register rscratch, int vec_enc) { 5132 evcvttps2qq(dst, src, vec_enc); 5133 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 5134 } 5135 5136 // Handling for downcasting from double to integer or sub-word types on AVX2. 5137 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5138 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5139 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5140 int to_elem_sz = type2aelembytes(to_elem_bt); 5141 assert(to_elem_sz < 8, ""); 5142 vcvttpd2dq(dst, src, vec_enc); 5143 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5144 float_sign_flip, vec_enc); 5145 if (to_elem_sz < 4) { 5146 // xtmp4 holds all zero lanes. 
5147 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5148 } 5149 } 5150 5151 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5152 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5153 KRegister ktmp2, AddressLiteral sign_flip, 5154 Register rscratch, int vec_enc) { 5155 if (VM_Version::supports_avx512dq()) { 5156 evcvttpd2qq(dst, src, vec_enc); 5157 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5158 switch(to_elem_bt) { 5159 case T_LONG: 5160 break; 5161 case T_INT: 5162 evpmovsqd(dst, dst, vec_enc); 5163 break; 5164 case T_SHORT: 5165 evpmovsqd(dst, dst, vec_enc); 5166 evpmovdw(dst, dst, vec_enc); 5167 break; 5168 case T_BYTE: 5169 evpmovsqd(dst, dst, vec_enc); 5170 evpmovdb(dst, dst, vec_enc); 5171 break; 5172 default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt)); 5173 } 5174 } else { 5175 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5176 vcvttpd2dq(dst, src, vec_enc); 5177 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5178 switch(to_elem_bt) { 5179 case T_INT: 5180 break; 5181 case T_SHORT: 5182 evpmovdw(dst, dst, vec_enc); 5183 break; 5184 case T_BYTE: 5185 evpmovdb(dst, dst, vec_enc); 5186 break; 5187 default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt)); 5188 } 5189 } 5190 } 5191 5192 void C2_MacroAssembler::vector_castF2X_avx10(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5193 switch(to_elem_bt) { 5194 case T_LONG: 5195 evcvttps2qqs(dst, src, vec_enc); 5196 break; 5197 case T_INT: 5198 evcvttps2dqs(dst, src, vec_enc); 5199 break; 5200 case T_SHORT: 5201 evcvttps2dqs(dst, src, vec_enc); 5202 evpmovdw(dst, dst, vec_enc); 5203 break; 5204 case T_BYTE: 5205 evcvttps2dqs(dst, src, vec_enc); 5206 evpmovdb(dst, dst, vec_enc); 5207 break; 5208 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt)); 5209 } 5210 } 5211 5212 void C2_MacroAssembler::vector_castF2X_avx10(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) { 5213 switch(to_elem_bt) { 5214 case T_LONG: 5215 evcvttps2qqs(dst, src, vec_enc); 5216 break; 5217 case T_INT: 5218 evcvttps2dqs(dst, src, vec_enc); 5219 break; 5220 case T_SHORT: 5221 evcvttps2dqs(dst, src, vec_enc); 5222 evpmovdw(dst, dst, vec_enc); 5223 break; 5224 case T_BYTE: 5225 evcvttps2dqs(dst, src, vec_enc); 5226 evpmovdb(dst, dst, vec_enc); 5227 break; 5228 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt)); 5229 } 5230 } 5231 5232 void C2_MacroAssembler::vector_castD2X_avx10(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5233 switch(to_elem_bt) { 5234 case T_LONG: 5235 evcvttpd2qqs(dst, src, vec_enc); 5236 break; 5237 case T_INT: 5238 evcvttpd2dqs(dst, src, vec_enc); 5239 break; 5240 case T_SHORT: 5241 evcvttpd2dqs(dst, src, vec_enc); 5242 evpmovdw(dst, dst, vec_enc); 5243 break; 5244 case T_BYTE: 5245 evcvttpd2dqs(dst, src, vec_enc); 5246 evpmovdb(dst, dst, vec_enc); 5247 break; 5248 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt)); 5249 } 5250 } 5251 5252 void C2_MacroAssembler::vector_castD2X_avx10(BasicType to_elem_bt, XMMRegister dst, Address 
src, int vec_enc) { 5253 switch(to_elem_bt) { 5254 case T_LONG: 5255 evcvttpd2qqs(dst, src, vec_enc); 5256 break; 5257 case T_INT: 5258 evcvttpd2dqs(dst, src, vec_enc); 5259 break; 5260 case T_SHORT: 5261 evcvttpd2dqs(dst, src, vec_enc); 5262 evpmovdw(dst, dst, vec_enc); 5263 break; 5264 case T_BYTE: 5265 evcvttpd2dqs(dst, src, vec_enc); 5266 evpmovdb(dst, dst, vec_enc); 5267 break; 5268 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt)); 5269 } 5270 } 5271 5272 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5273 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5274 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5275 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5276 // and re-instantiate original MXCSR.RC mode after that. 5277 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5278 5279 mov64(tmp, julong_cast(0.5L)); 5280 evpbroadcastq(xtmp1, tmp, vec_enc); 5281 vaddpd(xtmp1, src , xtmp1, vec_enc); 5282 evcvtpd2qq(dst, xtmp1, vec_enc); 5283 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5284 double_sign_flip, vec_enc);; 5285 5286 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5287 } 5288 5289 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5290 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5291 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5292 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5293 // and re-instantiate original MXCSR.RC mode after that. 5294 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5295 5296 movl(tmp, jint_cast(0.5)); 5297 movq(xtmp1, tmp); 5298 vbroadcastss(xtmp1, xtmp1, vec_enc); 5299 vaddps(xtmp1, src , xtmp1, vec_enc); 5300 vcvtps2dq(dst, xtmp1, vec_enc); 5301 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5302 float_sign_flip, vec_enc); 5303 5304 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5305 } 5306 5307 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5308 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5309 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5310 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5311 // and re-instantiate original MXCSR.RC mode after that. 
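// Worked illustration of the round-half-up trick (comment only, not emitted code):
//   round( 2.3f) = floor( 2.3f + 0.5f) = floor( 2.8f) =  2
//   round( 2.5f) = floor( 2.5f + 0.5f) = floor( 3.0f) =  3
//   round(-2.5f) = floor(-2.5f + 0.5f) = floor(-2.0f) = -2
//   round(-2.7f) = floor(-2.7f + 0.5f) = floor(-2.2f) = -3
// With MXCSR.RC set to round-toward-negative-infinity, the vcvtps2dq below performs the floor.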
5312 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5313 5314 movl(tmp, jint_cast(0.5)); 5315 movq(xtmp1, tmp); 5316 vbroadcastss(xtmp1, xtmp1, vec_enc); 5317 vaddps(xtmp1, src , xtmp1, vec_enc); 5318 vcvtps2dq(dst, xtmp1, vec_enc); 5319 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5320 5321 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5322 } 5323 5324 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5325 BasicType from_elem_bt, BasicType to_elem_bt) { 5326 switch (from_elem_bt) { 5327 case T_BYTE: 5328 switch (to_elem_bt) { 5329 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5330 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5331 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5332 default: ShouldNotReachHere(); 5333 } 5334 break; 5335 case T_SHORT: 5336 switch (to_elem_bt) { 5337 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5338 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5339 default: ShouldNotReachHere(); 5340 } 5341 break; 5342 case T_INT: 5343 assert(to_elem_bt == T_LONG, ""); 5344 vpmovzxdq(dst, src, vlen_enc); 5345 break; 5346 default: 5347 ShouldNotReachHere(); 5348 } 5349 } 5350 5351 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5352 BasicType from_elem_bt, BasicType to_elem_bt) { 5353 switch (from_elem_bt) { 5354 case T_BYTE: 5355 switch (to_elem_bt) { 5356 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5357 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5358 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5359 default: ShouldNotReachHere(); 5360 } 5361 break; 5362 case T_SHORT: 5363 switch (to_elem_bt) { 5364 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5365 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5366 default: ShouldNotReachHere(); 5367 } 5368 break; 5369 case T_INT: 5370 assert(to_elem_bt == T_LONG, ""); 5371 vpmovsxdq(dst, src, vlen_enc); 5372 break; 5373 default: 5374 ShouldNotReachHere(); 5375 } 5376 } 5377 5378 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5379 BasicType dst_bt, BasicType src_bt, int vlen) { 5380 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5381 assert(vlen_enc != AVX_512bit, ""); 5382 5383 int dst_bt_size = type2aelembytes(dst_bt); 5384 int src_bt_size = type2aelembytes(src_bt); 5385 if (dst_bt_size > src_bt_size) { 5386 switch (dst_bt_size / src_bt_size) { 5387 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5388 case 4: vpmovsxbd(dst, src, vlen_enc); break; 5389 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5390 default: ShouldNotReachHere(); 5391 } 5392 } else { 5393 assert(dst_bt_size < src_bt_size, ""); 5394 switch (src_bt_size / dst_bt_size) { 5395 case 2: { 5396 if (vlen_enc == AVX_128bit) { 5397 vpacksswb(dst, src, src, vlen_enc); 5398 } else { 5399 vpacksswb(dst, src, src, vlen_enc); 5400 vpermq(dst, dst, 0x08, vlen_enc); 5401 } 5402 break; 5403 } 5404 case 4: { 5405 if (vlen_enc == AVX_128bit) { 5406 vpackssdw(dst, src, src, vlen_enc); 5407 vpacksswb(dst, dst, dst, vlen_enc); 5408 } else { 5409 vpackssdw(dst, src, src, vlen_enc); 5410 vpermq(dst, dst, 0x08, vlen_enc); 5411 vpacksswb(dst, dst, dst, AVX_128bit); 5412 } 5413 break; 5414 } 5415 case 8: { 5416 if (vlen_enc == AVX_128bit) { 5417 vpshufd(dst, src, 0x08, vlen_enc); 5418 vpackssdw(dst, dst, dst, vlen_enc); 5419 vpacksswb(dst, dst, dst, vlen_enc); 5420 } else { 
5421 vpshufd(dst, src, 0x08, vlen_enc); 5422 vpermq(dst, dst, 0x08, vlen_enc); 5423 vpackssdw(dst, dst, dst, AVX_128bit); 5424 vpacksswb(dst, dst, dst, AVX_128bit); 5425 } 5426 break; 5427 } 5428 default: ShouldNotReachHere(); 5429 } 5430 } 5431 } 5432 5433 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5434 bool merge, BasicType bt, int vlen_enc) { 5435 if (bt == T_INT) { 5436 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5437 } else { 5438 assert(bt == T_LONG, ""); 5439 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5440 } 5441 } 5442 5443 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5444 bool merge, BasicType bt, int vlen_enc) { 5445 if (bt == T_INT) { 5446 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5447 } else { 5448 assert(bt == T_LONG, ""); 5449 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5450 } 5451 } 5452 5453 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5454 Register rtmp2, XMMRegister xtmp, int mask_len, 5455 int vec_enc) { 5456 int index = 0; 5457 int vindex = 0; 5458 mov64(rtmp1, 0x0101010101010101L); 5459 pdepq(rtmp1, src, rtmp1); 5460 if (mask_len > 8) { 5461 movq(rtmp2, src); 5462 vpxor(xtmp, xtmp, xtmp, vec_enc); 5463 movq(xtmp, rtmp1); 5464 } 5465 movq(dst, rtmp1); 5466 5467 mask_len -= 8; 5468 while (mask_len > 0) { 5469 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5470 index++; 5471 if ((index % 2) == 0) { 5472 pxor(xtmp, xtmp); 5473 } 5474 mov64(rtmp1, 0x0101010101010101L); 5475 shrq(rtmp2, 8); 5476 pdepq(rtmp1, rtmp2, rtmp1); 5477 pinsrq(xtmp, rtmp1, index % 2); 5478 vindex = index / 2; 5479 if (vindex) { 5480 // Write entire 16 byte vector when both 64 bit 5481 // lanes are updated to save redundant instructions.
5482 if (index % 2) { 5483 vinsertf128(dst, dst, xtmp, vindex); 5484 } 5485 } else { 5486 vmovdqu(dst, xtmp); 5487 } 5488 mask_len -= 8; 5489 } 5490 } 5491 5492 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5493 switch(opc) { 5494 case Op_VectorMaskTrueCount: 5495 popcntq(dst, tmp); 5496 break; 5497 case Op_VectorMaskLastTrue: 5498 if (VM_Version::supports_lzcnt()) { 5499 lzcntq(tmp, tmp); 5500 movl(dst, 63); 5501 subl(dst, tmp); 5502 } else { 5503 movl(dst, -1); 5504 bsrq(tmp, tmp); 5505 cmov32(Assembler::notZero, dst, tmp); 5506 } 5507 break; 5508 case Op_VectorMaskFirstTrue: 5509 if (VM_Version::supports_bmi1()) { 5510 if (masklen < 32) { 5511 orl(tmp, 1 << masklen); 5512 tzcntl(dst, tmp); 5513 } else if (masklen == 32) { 5514 tzcntl(dst, tmp); 5515 } else { 5516 assert(masklen == 64, ""); 5517 tzcntq(dst, tmp); 5518 } 5519 } else { 5520 if (masklen < 32) { 5521 orl(tmp, 1 << masklen); 5522 bsfl(dst, tmp); 5523 } else { 5524 assert(masklen == 32 || masklen == 64, ""); 5525 movl(dst, masklen); 5526 if (masklen == 32) { 5527 bsfl(tmp, tmp); 5528 } else { 5529 bsfq(tmp, tmp); 5530 } 5531 cmov32(Assembler::notZero, dst, tmp); 5532 } 5533 } 5534 break; 5535 case Op_VectorMaskToLong: 5536 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5537 break; 5538 default: assert(false, "Unhandled mask operation"); 5539 } 5540 } 5541 5542 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5543 int masklen, int masksize, int vec_enc) { 5544 assert(VM_Version::supports_popcnt(), ""); 5545 5546 if(VM_Version::supports_avx512bw()) { 5547 kmovql(tmp, mask); 5548 } else { 5549 assert(masklen <= 16, ""); 5550 kmovwl(tmp, mask); 5551 } 5552 5553 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5554 // operations needs to be clipped. 5555 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5556 andq(tmp, (1 << masklen) - 1); 5557 } 5558 5559 vector_mask_operation_helper(opc, dst, tmp, masklen); 5560 } 5561 5562 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5563 Register tmp, int masklen, BasicType bt, int vec_enc) { 5564 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5565 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5566 assert(VM_Version::supports_popcnt(), ""); 5567 5568 bool need_clip = false; 5569 switch(bt) { 5570 case T_BOOLEAN: 5571 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5572 vpxor(xtmp, xtmp, xtmp, vec_enc); 5573 vpsubb(xtmp, xtmp, mask, vec_enc); 5574 vpmovmskb(tmp, xtmp, vec_enc); 5575 need_clip = masklen < 16; 5576 break; 5577 case T_BYTE: 5578 vpmovmskb(tmp, mask, vec_enc); 5579 need_clip = masklen < 16; 5580 break; 5581 case T_SHORT: 5582 vpacksswb(xtmp, mask, mask, vec_enc); 5583 if (masklen >= 16) { 5584 vpermpd(xtmp, xtmp, 8, vec_enc); 5585 } 5586 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5587 need_clip = masklen < 16; 5588 break; 5589 case T_INT: 5590 case T_FLOAT: 5591 vmovmskps(tmp, mask, vec_enc); 5592 need_clip = masklen < 4; 5593 break; 5594 case T_LONG: 5595 case T_DOUBLE: 5596 vmovmskpd(tmp, mask, vec_enc); 5597 need_clip = masklen < 2; 5598 break; 5599 default: assert(false, "Unhandled type, %s", type2name(bt)); 5600 } 5601 5602 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5603 // operations needs to be clipped. 
5604 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5605 // need_clip implies masklen < 32 5606 andq(tmp, (1 << masklen) - 1); 5607 } 5608 5609 vector_mask_operation_helper(opc, dst, tmp, masklen); 5610 } 5611 5612 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5613 Register rtmp2, int mask_len) { 5614 kmov(rtmp1, src); 5615 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5616 mov64(rtmp2, -1L); 5617 pextq(rtmp2, rtmp2, rtmp1); 5618 kmov(dst, rtmp2); 5619 } 5620 5621 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5622 XMMRegister mask, Register rtmp, Register rscratch, 5623 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5624 int vec_enc) { 5625 assert(type2aelembytes(bt) >= 4, ""); 5626 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5627 address compress_perm_table = nullptr; 5628 address expand_perm_table = nullptr; 5629 if (type2aelembytes(bt) == 8) { 5630 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5631 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5632 vmovmskpd(rtmp, mask, vec_enc); 5633 } else { 5634 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5635 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5636 vmovmskps(rtmp, mask, vec_enc); 5637 } 5638 shlq(rtmp, 5); // for 32 byte permute row. 5639 if (opcode == Op_CompressV) { 5640 lea(rscratch, ExternalAddress(compress_perm_table)); 5641 } else { 5642 lea(rscratch, ExternalAddress(expand_perm_table)); 5643 } 5644 addptr(rtmp, rscratch); 5645 vmovdqu(permv, Address(rtmp)); 5646 vpermps(dst, permv, src, Assembler::AVX_256bit); 5647 vpxor(xtmp, xtmp, xtmp, vec_enc); 5648 // Blend the result with zero vector using permute mask, each column entry 5649 // in a permute table row contains either a valid permute index or a -1 (default) 5650 // value, this can potentially be used as a blending mask after 5651 // compressing/expanding the source vector lanes. 
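// Illustration (compress, 8 x 32-bit lanes, layout as described above): for mask 0b00000101 the
// selected permute row would be [ 0, 2, -1, -1, -1, -1, -1, -1 ], so src lanes 0 and 2 are packed
// to the front and the remaining columns, whose entries have the sign bit set, are zeroed by the
// blend below. (Assumes the stub tables encode unselected columns as -1, per the comment above.)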
5652 vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv); 5653 } 5654 5655 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5656 bool merge, BasicType bt, int vec_enc) { 5657 if (opcode == Op_CompressV) { 5658 switch(bt) { 5659 case T_BYTE: 5660 evpcompressb(dst, mask, src, merge, vec_enc); 5661 break; 5662 case T_CHAR: 5663 case T_SHORT: 5664 evpcompressw(dst, mask, src, merge, vec_enc); 5665 break; 5666 case T_INT: 5667 evpcompressd(dst, mask, src, merge, vec_enc); 5668 break; 5669 case T_FLOAT: 5670 evcompressps(dst, mask, src, merge, vec_enc); 5671 break; 5672 case T_LONG: 5673 evpcompressq(dst, mask, src, merge, vec_enc); 5674 break; 5675 case T_DOUBLE: 5676 evcompresspd(dst, mask, src, merge, vec_enc); 5677 break; 5678 default: 5679 fatal("Unsupported type %s", type2name(bt)); 5680 break; 5681 } 5682 } else { 5683 assert(opcode == Op_ExpandV, ""); 5684 switch(bt) { 5685 case T_BYTE: 5686 evpexpandb(dst, mask, src, merge, vec_enc); 5687 break; 5688 case T_CHAR: 5689 case T_SHORT: 5690 evpexpandw(dst, mask, src, merge, vec_enc); 5691 break; 5692 case T_INT: 5693 evpexpandd(dst, mask, src, merge, vec_enc); 5694 break; 5695 case T_FLOAT: 5696 evexpandps(dst, mask, src, merge, vec_enc); 5697 break; 5698 case T_LONG: 5699 evpexpandq(dst, mask, src, merge, vec_enc); 5700 break; 5701 case T_DOUBLE: 5702 evexpandpd(dst, mask, src, merge, vec_enc); 5703 break; 5704 default: 5705 fatal("Unsupported type %s", type2name(bt)); 5706 break; 5707 } 5708 } 5709 } 5710 5711 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5712 KRegister ktmp1, int vec_enc) { 5713 if (opcode == Op_SignumVD) { 5714 vsubpd(dst, zero, one, vec_enc); 5715 // if src < 0 ? -1 : 1 5716 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5717 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5718 // if src == NaN, -0.0 or 0.0 return src. 5719 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5720 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5721 } else { 5722 assert(opcode == Op_SignumVF, ""); 5723 vsubps(dst, zero, one, vec_enc); 5724 // if src < 0 ? -1 : 1 5725 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5726 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5727 // if src == NaN, -0.0 or 0.0 return src. 5728 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5729 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5730 } 5731 } 5732 5733 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5734 XMMRegister xtmp1, int vec_enc) { 5735 if (opcode == Op_SignumVD) { 5736 vsubpd(dst, zero, one, vec_enc); 5737 // if src < 0 ? -1 : 1 5738 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5739 // if src == NaN, -0.0 or 0.0 return src. 5740 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5741 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5742 } else { 5743 assert(opcode == Op_SignumVF, ""); 5744 vsubps(dst, zero, one, vec_enc); 5745 // if src < 0 ? -1 : 1 5746 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5747 // if src == NaN, -0.0 or 0.0 return src. 
5748 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5749 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5750 } 5751 } 5752 5753 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5754 if (VM_Version::supports_avx512bw()) { 5755 if (mask_len > 32) { 5756 kmovql(dst, src); 5757 } else { 5758 kmovdl(dst, src); 5759 if (mask_len != 32) { 5760 kshiftrdl(dst, dst, 32 - mask_len); 5761 } 5762 } 5763 } else { 5764 assert(mask_len <= 16, ""); 5765 kmovwl(dst, src); 5766 if (mask_len != 16) { 5767 kshiftrwl(dst, dst, 16 - mask_len); 5768 } 5769 } 5770 } 5771 5772 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5773 int lane_size = type2aelembytes(bt); 5774 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5775 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) { 5776 movptr(rtmp, imm32); 5777 switch(lane_size) { 5778 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5779 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5780 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5781 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5782 default : fatal("Unsupported lane size %d", lane_size); 5783 break; 5784 } 5785 } else { 5786 movptr(rtmp, imm32); 5787 movq(dst, rtmp); 5788 switch(lane_size) { 5789 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5790 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5791 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5792 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5793 default : fatal("Unsupported lane size %d", lane_size); 5794 break; 5795 } 5796 } 5797 } 5798 5799 // 5800 // Following is lookup table based popcount computation algorithm:- 5801 // Index Bit set count 5802 // [ 0000 -> 0, 5803 // 0001 -> 1, 5804 // 0010 -> 1, 5805 // 0011 -> 2, 5806 // 0100 -> 1, 5807 // 0101 -> 2, 5808 // 0110 -> 2, 5809 // 0111 -> 3, 5810 // 1000 -> 1, 5811 // 1001 -> 2, 5812 // 1010 -> 2, 5813 // 1011 -> 3, 5814 // 1100 -> 2, 5815 // 1101 -> 3, 1110 -> 3, 5816 // 1111 -> 4 ] 5817 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5818 // shuffle indices for lookup table access. 5819 // b. Right shift each byte of vector lane by 4 positions. 5820 // c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as 5821 // shuffle indices for lookup table access. 5822 // d. Add the bitset count of upper and lower 4 bits of each byte. 5823 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5824 // count of all the bytes of a quadword. 5825 // f. Perform step e. for upper 128bit vector lane. 5826 // g. Pack the bitset count of quadwords back to double word. 5827 // h. Unpacking and packing operations are not needed for 64bit vector lane.
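//
// For reference, a scalar sketch of steps a.-d. (illustration only, not used by the generated code;
// the 16-entry table mirrors what StubRoutines::x86::vector_popcount_lut() is described to hold,
// i.e. the bit counts of all nibble values):
//
//   static const uint8_t nibble_popcount_lut[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
//   uint8_t popcount_byte_via_lut(uint8_t b) {
//     uint8_t lo = nibble_popcount_lut[b & 0x0F];        // a. bit count of the 4 LSB bits
//     uint8_t hi = nibble_popcount_lut[(b >> 4) & 0x0F];  // b./c. shift by 4, bit count of the 4 MSB bits
//     return lo + hi;                                     // d. sum of the two nibble counts
//   }
//
// Steps e.-g. then widen these per-byte counts with vpsadbw/vpackuswb for int/long lanes.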
5828 5829 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5830 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5831 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5832 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5833 vpsrlw(dst, src, 4, vec_enc); 5834 vpand(dst, dst, xtmp1, vec_enc); 5835 vpand(xtmp1, src, xtmp1, vec_enc); 5836 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5837 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5838 vpshufb(dst, xtmp2, dst, vec_enc); 5839 vpaddb(dst, dst, xtmp1, vec_enc); 5840 } 5841 5842 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5843 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5844 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5845 // Following code is as per steps e,f,g and h of above algorithm. 5846 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5847 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5848 vpsadbw(dst, dst, xtmp2, vec_enc); 5849 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5850 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5851 vpackuswb(dst, xtmp1, dst, vec_enc); 5852 } 5853 5854 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5855 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5856 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5857 // Add the popcount of upper and lower bytes of word. 5858 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5859 vpsrlw(dst, xtmp1, 8, vec_enc); 5860 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5861 vpaddw(dst, dst, xtmp1, vec_enc); 5862 } 5863 5864 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5865 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5866 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5867 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5868 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5869 } 5870 5871 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5872 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5873 switch(bt) { 5874 case T_LONG: 5875 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5876 break; 5877 case T_INT: 5878 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5879 break; 5880 case T_CHAR: 5881 case T_SHORT: 5882 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5883 break; 5884 case T_BYTE: 5885 case T_BOOLEAN: 5886 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5887 break; 5888 default: 5889 fatal("Unsupported type %s", type2name(bt)); 5890 break; 5891 } 5892 } 5893 5894 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5895 KRegister mask, bool merge, int vec_enc) { 5896 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5897 switch(bt) { 5898 case T_LONG: 5899 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5900 evpopcntq(dst, mask, src, merge, vec_enc); 5901 break; 5902 case T_INT: 5903 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5904 evpopcntd(dst, mask, src, merge, vec_enc); 5905 break; 5906 case T_CHAR: 5907 case T_SHORT: 5908 assert(VM_Version::supports_avx512_bitalg(), ""); 5909 evpopcntw(dst, mask, src, merge, vec_enc); 5910 break; 5911 case T_BYTE: 5912 case T_BOOLEAN: 5913 assert(VM_Version::supports_avx512_bitalg(), ""); 5914 evpopcntb(dst, mask, 
src, merge, vec_enc); 5915 break; 5916 default: 5917 fatal("Unsupported type %s", type2name(bt)); 5918 break; 5919 } 5920 } 5921 5922 // Bit reversal algorithm first reverses the bits of each byte followed by 5923 // a byte level reversal for multi-byte primitive types (short/int/long). 5924 // Algorithm performs a lookup table access to get reverse bit sequence 5925 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5926 // is obtained by swapping the reverse bit sequences of upper and lower 5927 // nibble of a byte. 5928 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5929 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5930 if (VM_Version::supports_avx512vlbw()) { 5931 5932 // Get the reverse bit sequence of lower nibble of each byte. 5933 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5934 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5935 evpandq(dst, xtmp2, src, vec_enc); 5936 vpshufb(dst, xtmp1, dst, vec_enc); 5937 vpsllq(dst, dst, 4, vec_enc); 5938 5939 // Get the reverse bit sequence of upper nibble of each byte. 5940 vpandn(xtmp2, xtmp2, src, vec_enc); 5941 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5942 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5943 5944 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5945 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5946 evporq(xtmp2, dst, xtmp2, vec_enc); 5947 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5948 5949 } else if(vec_enc == Assembler::AVX_512bit) { 5950 // Shift based bit reversal. 5951 assert(bt == T_LONG || bt == T_INT, ""); 5952 5953 // Swap lower and upper nibble of each byte. 5954 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5955 5956 // Swap two least and most significant bits of each nibble. 5957 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5958 5959 // Swap adjacent pair of bits. 5960 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5961 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5962 5963 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5964 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5965 } else { 5966 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5967 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5968 5969 // Get the reverse bit sequence of lower nibble of each byte. 5970 vpand(dst, xtmp2, src, vec_enc); 5971 vpshufb(dst, xtmp1, dst, vec_enc); 5972 vpsllq(dst, dst, 4, vec_enc); 5973 5974 // Get the reverse bit sequence of upper nibble of each byte. 5975 vpandn(xtmp2, xtmp2, src, vec_enc); 5976 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5977 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5978 5979 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5980 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5981 vpor(xtmp2, dst, xtmp2, vec_enc); 5982 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5983 } 5984 } 5985 5986 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5987 XMMRegister xtmp, Register rscratch) { 5988 assert(VM_Version::supports_gfni(), ""); 5989 assert(rscratch != noreg || always_reachable(mask), "missing"); 5990 5991 // Galois field instruction based bit reversal based on following algorithm. 
5992 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5993 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5994 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5995 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5996 } 5997 5998 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5999 XMMRegister xtmp1, Register rtmp, int vec_enc) { 6000 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 6001 evpandq(dst, xtmp1, src, vec_enc); 6002 vpsllq(dst, dst, nbits, vec_enc); 6003 vpandn(xtmp1, xtmp1, src, vec_enc); 6004 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 6005 evporq(dst, dst, xtmp1, vec_enc); 6006 } 6007 6008 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6009 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6010 // Shift based bit reversal. 6011 assert(VM_Version::supports_evex(), ""); 6012 switch(bt) { 6013 case T_LONG: 6014 // Swap upper and lower double word of each quad word. 6015 evprorq(xtmp1, k0, src, 32, true, vec_enc); 6016 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 6017 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6018 break; 6019 case T_INT: 6020 // Swap upper and lower word of each double word. 6021 evprord(xtmp1, k0, src, 16, true, vec_enc); 6022 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6023 break; 6024 case T_CHAR: 6025 case T_SHORT: 6026 // Swap upper and lower byte of each word. 6027 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 6028 break; 6029 case T_BYTE: 6030 evmovdquq(dst, k0, src, true, vec_enc); 6031 break; 6032 default: 6033 fatal("Unsupported type %s", type2name(bt)); 6034 break; 6035 } 6036 } 6037 6038 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6039 if (bt == T_BYTE) { 6040 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6041 evmovdquq(dst, k0, src, true, vec_enc); 6042 } else { 6043 vmovdqu(dst, src); 6044 } 6045 return; 6046 } 6047 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6048 // pre-computed shuffle indices. 
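// Illustration: for T_INT each 4-byte element 0x11223344 becomes 0x44332211. The per-type
// permutation constants loaded below (e.g. vector_reverse_byte_perm_mask_int()) hold the byte
// indices in that reversed order, so a single vpshufb per 128-bit lane performs the swap.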
6049 switch(bt) { 6050 case T_LONG: 6051 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6052 break; 6053 case T_INT: 6054 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6055 break; 6056 case T_CHAR: 6057 case T_SHORT: 6058 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6059 break; 6060 default: 6061 fatal("Unsupported type %s", type2name(bt)); 6062 break; 6063 } 6064 vpshufb(dst, src, dst, vec_enc); 6065 } 6066 6067 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6068 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6069 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6070 assert(is_integral_type(bt), ""); 6071 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6072 assert(VM_Version::supports_avx512cd(), ""); 6073 switch(bt) { 6074 case T_LONG: 6075 evplzcntq(dst, ktmp, src, merge, vec_enc); 6076 break; 6077 case T_INT: 6078 evplzcntd(dst, ktmp, src, merge, vec_enc); 6079 break; 6080 case T_SHORT: 6081 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6082 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6083 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6084 vpunpckhwd(dst, xtmp1, src, vec_enc); 6085 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6086 vpackusdw(dst, xtmp2, dst, vec_enc); 6087 break; 6088 case T_BYTE: 6089 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6090 // accessing the lookup table. 6091 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6092 // accessing the lookup table. 6093 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6094 assert(VM_Version::supports_avx512bw(), ""); 6095 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6096 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6097 vpand(xtmp2, dst, src, vec_enc); 6098 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6099 vpsrlw(xtmp3, src, 4, vec_enc); 6100 vpand(xtmp3, dst, xtmp3, vec_enc); 6101 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6102 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6103 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6104 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6105 break; 6106 default: 6107 fatal("Unsupported type %s", type2name(bt)); 6108 break; 6109 } 6110 } 6111 6112 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6113 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6114 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6115 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6116 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6117 // accessing the lookup table. 6118 vpand(dst, xtmp2, src, vec_enc); 6119 vpshufb(dst, xtmp1, dst, vec_enc); 6120 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6121 // accessing the lookup table. 6122 vpsrlw(xtmp3, src, 4, vec_enc); 6123 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6124 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6125 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
6126 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6127 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6128 vpaddb(dst, dst, xtmp2, vec_enc); 6129 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6130 } 6131 6132 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6133 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6134 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6135 // Add zero counts of lower byte and upper byte of a word if 6136 // upper byte holds a zero value. 6137 vpsrlw(xtmp3, src, 8, vec_enc); 6138 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6139 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6140 vpsllw(xtmp2, dst, 8, vec_enc); 6141 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6142 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6143 vpsrlw(dst, dst, 8, vec_enc); 6144 } 6145 6146 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6147 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6148 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6149 // hence biased exponent can be used to compute leading zero count as per 6150 // following formula:- 6151 // LZCNT = 31 - (biased_exp - 127) 6152 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6153 6154 // Broadcast 0xFF 6155 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6156 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6157 6158 // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher 6159 // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit 6160 // contributes to the leading number of zeros. 6161 vpsrld(xtmp2, src, 1, vec_enc); 6162 vpandn(xtmp3, xtmp2, src, vec_enc); 6163 6164 // Extract biased exponent. 6165 vcvtdq2ps(dst, xtmp3, vec_enc); 6166 vpsrld(dst, dst, 23, vec_enc); 6167 vpand(dst, dst, xtmp1, vec_enc); 6168 6169 // Broadcast 127. 6170 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6171 // Exponent = biased_exp - 127 6172 vpsubd(dst, dst, xtmp1, vec_enc); 6173 6174 // Exponent_plus_one = Exponent + 1 6175 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6176 vpaddd(dst, dst, xtmp3, vec_enc); 6177 6178 // Replace -ve exponent with zero, exponent is -ve when src 6179 // lane contains a zero value. 6180 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6181 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6182 6183 // Rematerialize broadcast 32. 6184 vpslld(xtmp1, xtmp3, 5, vec_enc); 6185 // Exponent is 32 if corresponding source lane contains max_int value. 6186 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6187 // LZCNT = 32 - exponent_plus_one 6188 vpsubd(dst, xtmp1, dst, vec_enc); 6189 6190 // Replace LZCNT with a value 1 if corresponding source lane 6191 // contains max_int value. 6192 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6193 6194 // Replace biased_exp with 0 if source lane value is less than zero. 6195 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6196 vblendvps(dst, dst, xtmp2, src, vec_enc); 6197 } 6198 6199 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6200 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6201 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6202 // Add zero counts of lower word and upper word of a double word if 6203 // upper word holds a zero value. 
6204 vpsrld(xtmp3, src, 16, vec_enc); 6205 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6206 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6207 vpslld(xtmp2, dst, 16, vec_enc); 6208 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6209 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6210 vpsrld(dst, dst, 16, vec_enc); 6211 // Add zero counts of lower doubleword and upper doubleword of a 6212 // quadword if upper doubleword holds a zero value. 6213 vpsrlq(xtmp3, src, 32, vec_enc); 6214 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6215 vpsllq(xtmp2, dst, 32, vec_enc); 6216 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6217 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6218 vpsrlq(dst, dst, 32, vec_enc); 6219 } 6220 6221 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6222 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6223 Register rtmp, int vec_enc) { 6224 assert(is_integral_type(bt), "unexpected type"); 6225 assert(vec_enc < Assembler::AVX_512bit, ""); 6226 switch(bt) { 6227 case T_LONG: 6228 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6229 break; 6230 case T_INT: 6231 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6232 break; 6233 case T_SHORT: 6234 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6235 break; 6236 case T_BYTE: 6237 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6238 break; 6239 default: 6240 fatal("Unsupported type %s", type2name(bt)); 6241 break; 6242 } 6243 } 6244 6245 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6246 switch(bt) { 6247 case T_BYTE: 6248 vpsubb(dst, src1, src2, vec_enc); 6249 break; 6250 case T_SHORT: 6251 vpsubw(dst, src1, src2, vec_enc); 6252 break; 6253 case T_INT: 6254 vpsubd(dst, src1, src2, vec_enc); 6255 break; 6256 case T_LONG: 6257 vpsubq(dst, src1, src2, vec_enc); 6258 break; 6259 default: 6260 fatal("Unsupported type %s", type2name(bt)); 6261 break; 6262 } 6263 } 6264 6265 // Trailing zero count computation is based on leading zero count operation as per 6266 // following equation. All AVX3 targets support AVX512CD feature which offers 6267 // direct vector instruction to compute leading zero count. 
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}

// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}

void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}

void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
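  //
  // Note added for exposition (not in the original source): when the divisor has its
  // sign bit set it is at least 2^31 as an unsigned value, so the unsigned quotient
  // can only be 0 or 1; it is 1 exactly when dividend >=u divisor. The expression
  // (dividend & ~(dividend - divisor)) has its sign bit set precisely in that case,
  // so the logical shift by 31 yields the quotient, and the masked subtraction below
  // reconstructs the remainder.
  //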
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}

void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap the lower and upper 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap the lower and upper 4 bits (nibbles) of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  bswapl(dst);
}

void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap the lower and upper 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap the lower and upper 4 bits (nibbles) of each byte.
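    //
    // Note added for exposition (not in the original source): the three mask-and-shift
    // stages (1-bit, 2-bit and 4-bit swaps) reverse the bits within each individual
    // byte; the trailing bswap then reverses the byte order, which together yields the
    // full bit reversal of the register. For example, the byte 0b10110001 becomes
    // 0b10001101 after the three stages.
    //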
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  bswapq(dst);
}

void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}

void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and the indices are taken from the lower
  // 4 bits of each shuffle lane, so all shuffle indices are effectively normalized
  // to the range 0-15. As a consequence, indices that differ by a multiple of 16
  // (e.g. 0, 16, 32 and 48) select the same relative byte position within a
  // 128 bit lane.
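  //
  // Note added for exposition (not in the original source): the code below therefore
  // processes one 128 bit source lane per step. For step k (k = 0..3) it broadcasts
  // source lane k to all lanes, builds an opmask selecting shuffle indices in
  // [16*k, 16*(k+1)), and merges the vpshufb result for exactly those destination
  // bytes, so after four steps every index in 0-63 has been serviced.
  //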
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
  // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using
  // the original shuffle indices and move the shuffled lanes corresponding to the true
  // mask to the destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32
  // and broadcasting the second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48
  // and broadcasting the third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64
  // and broadcasting the fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}

void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}

void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  switch(opcode) {
    case Op_AddHF: vaddsh(dst, src1, src2); break;
    case Op_SubHF: vsubsh(dst, src1, src2); break;
    case Op_MulHF: vmulsh(dst, src1, src2); break;
    case Op_DivHF: vdivsh(dst, src1, src2); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                      XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the magnitude of the second input is greater than that of the first input.
  // overflow_mask = Inp1 <u Inp2
  evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}

void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}

void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // The unsigned value range comprises only non-negative numbers, thus there exists only an upper bound saturation.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare: Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
}

//
// Section 2-13 of Hacker's Delight lists the following overflow detection check for the saturating
// unsigned addition operation:
// overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to the following reduced expression
// overflow_mask = (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
// (https://alive2.llvm.org/ce/z/XDQ7dY)
//

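//
// Note added for exposition (not in the original source): the reduced form can also be
// seen directly from a + b = (a | b) + (a & b). Without wraparound the sum is therefore
// at least (a | b); with wraparound the truncated sum equals (a | b) + (a & b) - 2^w,
// which is below (a | b) because (a & b) < 2^w. Hence the unsigned comparison
// (a + b) <u (a | b) is true exactly when the addition overflows.
//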
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = Minimum signed value (xtmp1 is set to all ones as a side effect).
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to a signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to a signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  // Res = overflow_mask ? Max_Unsigned (all ones, held in xtmp1) : Res
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}

void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}

void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}


void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  if (elem_bt == T_LONG) {
    if (VM_Version::supports_evex()) {
      evpsraq(dst, src, 63, vlen_enc);
    } else {
      vpsrad(dst, src, 31, vlen_enc);
      vpshufd(dst, dst, 0xF5, vlen_enc);
    }
  } else {
    assert(elem_bt == T_INT, "");
    vpsrad(dst, src, 31, vlen_enc);
  }
}

void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsrlq(dst, allones, 1, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpsrld(dst, allones, 1, vlen_enc);
  }
}

void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsllq(dst, allones, 63, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
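    // Note added for exposition (not in the original source): shifting all ones left
    // by 31 yields 0x80000000 (Integer.MIN_VALUE) in every lane; the T_LONG branch
    // above analogously produces Long.MIN_VALUE, while vpgenmax_value's logical right
    // shift of all ones by one yields the corresponding MAX_VALUE patterns.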
    vpslld(dst, allones, 31, vlen_enc);
  }
}

void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
                                Assembler::ComparisonPredicate cond, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
    case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
    case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
    case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
    case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
    case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
    case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
  if (elem_bt == T_LONG) {
    evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  } else {
    assert(elem_bt == T_INT, "");
    evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  }
}

void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the result's polarity does not match the (equal) polarity of the inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when the inputs have opposite polarity and the
    // result's polarity does not match the first input's polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after the above call.

  // Compute mask based on first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in the first input polarity mask hold a min value.
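  //
  // Note added for exposition (not in the original source): on overflow the saturated
  // value is MIN_VALUE when src1 is negative and MAX_VALUE otherwise. This covers both
  // cases, since signed addition only overflows when the operands share src1's sign,
  // and signed subtraction only overflows when the operands differ in sign, so the
  // overflow direction is determined by src1 alone.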
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using the overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}


void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the result's polarity does not match the (equal) polarity of the inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when the inputs have opposite polarity and the
    // result's polarity does not match the first input's polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute the overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose the saturating min/max vector using the first input polarity mask.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend the result with the saturating vector using the overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                     XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      evpermi2b(dst, src1, src2, vlen_enc);
      break;
    case T_SHORT:
      evpermi2w(dst, src1, src2, vlen_enc);
      break;
    case T_INT:
      evpermi2d(dst, src1, src2, vlen_enc);
      break;
    case T_LONG:
      evpermi2q(dst, src1, src2, vlen_enc);
      break;
    case T_FLOAT:
      evpermi2ps(dst, src1, src2, vlen_enc);
      break;
    case T_DOUBLE:
      evpermi2pd(dst, src1, src2, vlen_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
  if (is_unsigned) {
    vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  } else {
    vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  }
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
  if (is_unsigned) {
    vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  } else {
    vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  }
}

void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(opcode) {
    case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
    case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
    case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
    case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst,
                                 XMMRegister src1, Address src2, int vlen_enc) {
  switch(opcode) {
    case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
    case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
    case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
    case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}

void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The idea behind the above swapping is to make the second source operand a +ve value.
    // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned.
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result is the same as the first source if it is a NaN value; in case the second
    // operand holds a NaN value then, as per the above semantics, the result is the same as the
    // second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The idea behind the above swapping is to make the second source operand a -ve value.
    // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result is the same as the first source if it is a NaN value; in case the second
    // operand holds a NaN value then, as per the above semantics, the result is the same as the
    // second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}