
src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp


 394     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 395   }
 396 }
 397 
 398 // Use RTM for inflated locks
 399 // inputs: objReg (object to lock)
 400 //         boxReg (on-stack box address (displaced header location) - KILLED)
 401 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 402 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 403                                             Register scrReg, Register retry_on_busy_count_Reg,
 404                                             Register retry_on_abort_count_Reg,
 405                                             RTMLockingCounters* rtm_counters,
 406                                             Metadata* method_data, bool profile_rtm,
 407                                             Label& DONE_LABEL) {
 408   assert(UseRTMLocking, "why call this otherwise?");
 409   assert(tmpReg == rax, "");
 410   assert(scrReg == rdx, "");
 411   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 412   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 413 
 414   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 415   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 416 
 417   if (RTMRetryCount > 0) {
 418     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 419     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 420     bind(L_rtm_retry);
 421   }
 422   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 423     Label L_noincrement;
 424     if (RTMTotalCountIncrRate > 1) {
 425       // tmpReg, scrReg and flags are killed
 426       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 427     }
 428     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 429     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 430     bind(L_noincrement);
 431   }
 432   xbegin(L_on_abort);
 433   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 434   movptr(tmpReg, Address(tmpReg, owner_offset));

 540 //    But beware of excessive branch density on AMD Opterons.
 541 //
 542 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 543 //    or failure of the fast path.  If the fast path fails then we pass
 544 //    control to the slow path, typically in C.  In fast_lock and
 545 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 546 //    will emit a conditional branch immediately after the node.
 547 //    So we have branches to branches and lots of ICC.ZF games.
 548 //    Instead, it might be better to have C2 pass a "FailureLabel"
 549 //    into fast_lock and fast_unlock.  In the case of success, control
 550 //    will drop through the node.  ICC.ZF is undefined at exit.
 551 //    In the case of failure, the node will branch directly to the
 552 //    FailureLabel
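//
//    A minimal sketch of the ZF contract described above (illustrative only,
//    not code emitted by this file): the locking node leaves ICC.ZF set on
//    success, and C2 emits a conditional branch right after it, e.g.
//
//      fast_lock(obj, box, ...);          // ZF = 1 on success, ZF = 0 on failure
//      jcc(Assembler::notZero, L_slow);   // ZF == 0 -> call the runtime slow path
//      // fast path continues here
//
//    A "FailureLabel" variant would instead branch straight to L_slow inside
//    fast_lock and leave ZF undefined on the fall-through (success) path.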
 553 
 554 
 555 // obj: object to lock
 556 // box: on-stack box address (displaced header location) - KILLED
 557 // rax,: tmp -- KILLED
 558 // scr: tmp -- KILLED
 559 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 560                                  Register scrReg, Register cx1Reg, Register cx2Reg,
 561                                  RTMLockingCounters* rtm_counters,
 562                                  RTMLockingCounters* stack_rtm_counters,
 563                                  Metadata* method_data,
 564                                  bool use_rtm, bool profile_rtm) {
 565   // Ensure the register assignments are disjoint
 566   assert(tmpReg == rax, "");
 567 
 568   if (use_rtm) {
 569     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 570   } else {
 571     assert(cx2Reg == noreg, "");
 572     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 573   }
 574 
 575   // Possible cases that we'll encounter in fast_lock
 576   // ------------------------------------------------
 577   // * Inflated
 578   //    -- unlocked
 579   //    -- Locked
 580   //       = by self
 581   //       = by other
 582   // * neutral
 583   // * stack-locked
 584   //    -- by self
 585   //       = sp-proximity test hits
 586   //       = sp-proximity test generates false-negative
 587   //    -- by other
 588   //
 589 
 590   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 591 
 592   if (DiagnoseSyncOnValueBasedClasses != 0) {
 593     load_klass(tmpReg, objReg, cx1Reg);
 594     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 595     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 596     jcc(Assembler::notZero, DONE_LABEL);
 597   }
 598 
 599 #if INCLUDE_RTM_OPT
 600   if (UseRTMForStackLocks && use_rtm) {
 601     assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
 602     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 603                       stack_rtm_counters, method_data, profile_rtm,
 604                       DONE_LABEL, IsInflated);
 605   }
 606 #endif // INCLUDE_RTM_OPT
 607 
 608   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 609   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 610   jccb(Assembler::notZero, IsInflated);
 611 
 612   if (!UseHeavyMonitors) {
 613     // Attempt stack-locking ...
 614     orptr (tmpReg, markWord::unlocked_value);
 615     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 616     lock();
 617     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 618     jcc(Assembler::equal, COUNT);           // Success
 619 
 620     // Recursive locking.
 621     // The object is stack-locked: markword contains stack pointer to BasicLock.
 622     // Locked by current thread if difference with current SP is less than one page.
 623     subptr(tmpReg, rsp);
 624     // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 625     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
 626     movptr(Address(boxReg, 0), tmpReg);
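    // Worked example (assuming a 4K page and the 64-bit mask; illustrative
    // only): a recursive stack-lock leaves the markword pointing at a
    // BasicLock in this thread's stack, say rsp + 0x40.  Then
    // tmpReg - rsp == 0x40, and ANDing with (7 - 4096) == 0xFFFFFFFFFFFFF007
    // clears the in-page offset, leaving 0, so ZF == 1 and the 0 stored into
    // the box marks the recursive case.  Any other markword (neutral, or a
    // BasicLock in another thread's stack) leaves high bits set, ZF == 0, and
    // we take the slow path at DONE_LABEL.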
 627   } else {
 628     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 629     testptr(objReg, objReg);
 630   }



 631   jmp(DONE_LABEL);
 632 
 633   bind(IsInflated);
 634   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 635 
 636 #if INCLUDE_RTM_OPT
 637   // Use the same RTM locking code in 32- and 64-bit VM.
 638   if (use_rtm) {
 639     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 640                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 641   } else {
 642 #endif // INCLUDE_RTM_OPT
 643 
 644 #ifndef _LP64
 645   // The object is inflated.
 646 
 647   // boxReg refers to the on-stack BasicLock in the current frame.
 648   // We'd like to write:
 649   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
 650   // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
 651   // additional latency as we have another ST in the store buffer that must drain.
 652 
 653   // avoid ST-before-CAS
 654   // register juggle because we need tmpReg for cmpxchgptr below
 655   movptr(scrReg, boxReg);
 656   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 657 
 658   // Optimistic form: consider XORL tmpReg,tmpReg
 659   movptr(tmpReg, NULL_WORD);
 660 
 661   // Appears unlocked - try to swing _owner from null to non-null.
 662   // Ideally, I'd manifest "Self" with get_thread and then attempt
 663   // to CAS the register containing Self into m->Owner.
 664   // But we don't have enough registers, so instead we can either try to CAS
 665   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 666   // we later store "Self" into m->Owner.  Transiently storing a stack address
 667   // (rsp or the address of the box) into  m->owner is harmless.
 668   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 669   lock();
 670   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 671   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 672   // If we weren't able to swing _owner from NULL to the BasicLock
 673   // then take the slow path.
 674   jccb  (Assembler::notZero, NO_COUNT);
 675   // update _owner from BasicLock to thread
 676   get_thread (scrReg);                    // beware: clobbers ICCs
 677   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 678   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 679 
 680   // If the CAS fails we can either retry or pass control to the slow path.
 681   // We use the latter tactic.
 682   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 683   // If the CAS was successful ...
 684   //   Self has acquired the lock
 685   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 686   // Intentional fall-through into DONE_LABEL ...
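  // In rough C-like pseudocode (illustrative only; the register juggling and
  // the markWord::monitor_value tag arithmetic are elided) the sequence above
  // is:
  //   ok = (CAS(&m->_owner, NULL, box_address) == NULL); // try owner: null -> box
  //   box->_displaced_header = 3;                        // any non-zero value, done after the CAS
  //   if (!ok) goto slow_path;                           // ZF == 0
  //   m->_owner = Self;                                  // replace the box with the real thread
  //   // fall through with ZF == 1 (success)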
 687 #else // _LP64
 688   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 689   movq(scrReg, tmpReg);
 690   xorq(tmpReg, tmpReg);
 691   lock();
 692   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 693   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 694   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 695   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 696   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 697   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 698 
 699   cmpptr(r15_thread, rax);                // Check if we are already the owner (recursive lock)
 700   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 701   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 702   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 703 #endif // _LP64
 704 #if INCLUDE_RTM_OPT
 705   } // use_rtm()
 706 #endif
 707   // DONE_LABEL is a hot target - we'd really like to place it at the
 708   // start of cache line by padding with NOPs.
 709   // See the AMD and Intel software optimization manuals for the
 710   // most efficient "long" NOP encodings.
 711   // Unfortunately none of our alignment mechanisms suffice.
 712   bind(DONE_LABEL);
 713 
 714   // ZFlag == 1 count in fast path
 715   // ZFlag == 0 count in slow path
 716   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 717 
 718   bind(COUNT);
 719   // Count monitors in fast path

 769 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 770   assert(boxReg == rax, "");
 771   assert_different_registers(objReg, boxReg, tmpReg);
 772 
 773   Label DONE_LABEL, Stacked, CheckSucc, COUNT, NO_COUNT;
 774 
 775 #if INCLUDE_RTM_OPT
 776   if (UseRTMForStackLocks && use_rtm) {
 777     assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
 778     Label L_regular_unlock;
 779     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 780     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 781     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 782     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 783     xend();                                                           // otherwise end...
 784     jmp(DONE_LABEL);                                                  // ... and we're done
 785     bind(L_regular_unlock);
 786   }
 787 #endif
 788 

 789   if (!UseHeavyMonitors) {
 790     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 791     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 792   }
 793   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 794   if (!UseHeavyMonitors) {
 795     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 796     jccb   (Assembler::zero, Stacked);



 797   }
 798 
 799   // It's inflated.
 800 #if INCLUDE_RTM_OPT
 801   if (use_rtm) {
 802     Label L_regular_inflated_unlock;
 803     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 804     movptr(boxReg, Address(tmpReg, owner_offset));
 805     testptr(boxReg, boxReg);
 806     jccb(Assembler::notZero, L_regular_inflated_unlock);
 807     xend();
 808     jmpb(DONE_LABEL);
 809     bind(L_regular_inflated_unlock);
 810   }
 811 #endif
 812 
 813   // Despite our balanced locking property we still check that m->_owner == Self
 814   // as java routines or native JNI code called by this thread might
 815   // have released the lock.
 816   // Refer to the comments in synchronizer.cpp for how we might encode extra
 817   // state in _succ so we can avoid fetching EntryList|cxq.
 818   //
 819   // If there's no contention try a 1-0 exit.  That is, exit without
 820   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 821   // we detect and recover from the race that the 1-0 exit admits.
 822   //
 823   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 824   // before it STs null into _owner, releasing the lock.  Updates
 825   // to data protected by the critical section must be visible before
 826   // we drop the lock (and thus before any other thread could acquire
 827   // the lock and observe the fields protected by the lock).
 828   // IA32's memory-model is SPO, so STs are ordered with respect to
 829   // each other and there's no need for an explicit barrier (fence).
 830   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
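  // A rough sketch of the 1-0 exit attempted below (illustrative pseudocode;
  // the 32-bit and 64-bit paths differ in how recursion is handled):
  //   if (m->_recursions != 0) goto recursion_case;
  //   if ((m->_cxq | m->_EntryList) == 0) { m->_owner = NULL; return success; }
  //   if (m->_succ == NULL) goto slow_path;      // nobody is poised to take over
  //   m->_owner = NULL;  full_fence();           // ST owner; MEMBAR; LD succ
  //   if (m->_succ != NULL) return success;      // a successor will make progress
  //   if (CAS(&m->_owner, NULL, Self) != NULL) return success;  // someone else owns it now
  //   goto slow_path;                            // we re-own it; slow path picks a successor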
 831 #ifndef _LP64
 832   get_thread (boxReg);
 833 
 834   // Note that we could employ various encoding schemes to reduce
 835   // the number of loads below (currently 4) to just 2 or 3.
 836   // Refer to the comments in synchronizer.cpp.
 837   // In practice the chain of fetches doesn't seem to impact performance, however.
 838   xorptr(boxReg, boxReg);
 839   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 840   jccb  (Assembler::notZero, DONE_LABEL);
 841   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 842   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 843   jccb  (Assembler::notZero, CheckSucc);
 844   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 845   jmpb  (DONE_LABEL);
 846 
 847   bind (Stacked);
 848   // It's not inflated and it's not recursively stack-locked.
 849   // It must be stack-locked.
 850   // Try to reset the header to displaced header.
 851   // The "box" value on the stack is stable, so we can reload
 852   // and be assured we observe the same value as above.
 853   movptr(tmpReg, Address(boxReg, 0));
 854   lock();
 855   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 856   // Intentional fall-through into DONE_LABEL
 857 
 858   // DONE_LABEL is a hot target - we'd really like to place it at the
 859   // start of cache line by padding with NOPs.
 860   // See the AMD and Intel software optimization manuals for the
 861   // most efficient "long" NOP encodings.
 862   // Unfortunately none of our alignment mechanisms suffice.
 863   bind (CheckSucc);
 864 #else // _LP64
 865   // It's inflated
 866   Label LNotRecursive, LSuccess, LGoSlowPath;
 867 
 868   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 869   jccb(Assembler::equal, LNotRecursive);
 870 
 871   // Recursive inflated unlock
 872   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 873   jmpb(LSuccess);
 874 
 875   bind(LNotRecursive);
 876   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 877   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 878   jccb  (Assembler::notZero, CheckSucc);
 879   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 880   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 881   jmpb  (DONE_LABEL);
 882 
 883   // Try to avoid passing control into the slow_path ...
 884   bind  (CheckSucc);
 885 
 886   // The following optional optimization can be elided if necessary
 887   // Effectively: if (succ == null) goto slow path
 888   // The code reduces the window for a race, however,
 889   // and thus benefits performance.
 890   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 891   jccb  (Assembler::zero, LGoSlowPath);
 892 
 893   xorptr(boxReg, boxReg);
 894   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 895   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 896 
 897   // Memory barrier/fence
 898   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 899   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 900   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 901   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 902   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 903   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 904   lock(); addl(Address(rsp, 0), 0);
 905 
 906   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 907   jccb  (Assembler::notZero, LSuccess);
 908 



 909   // Rare inopportune interleaving - race.
 910   // The successor vanished in the small window above.
 911   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 912   // We need to ensure progress and succession.
 913   // Try to reacquire the lock.
 914   // If that fails then the new owner is responsible for succession and this
 915   // thread needs to take no further action and can exit via the fast path (success).
 916   // If the re-acquire succeeds then pass control into the slow path.
 917   // As implemented, this latter mode is horrible because we generated more
 918   // coherence traffic on the lock *and* artificially extended the critical section
 919   // length by virtue of passing control into the slow path.
 920 
 921   // box is really RAX -- the following CMPXCHG depends on that binding
 922   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 923   lock();
 924   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 925   // There's no successor so we tried to regrab the lock.
 926   // If that didn't work, then another thread grabbed the
 927   // lock so we're done (and exit was a success).
 928   jccb  (Assembler::notEqual, LSuccess);
 929   // Intentional fall-through into slow path
 930 
 931   bind  (LGoSlowPath);
 932   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 933   jmpb  (DONE_LABEL);
 934 
 935   bind  (LSuccess);
 936   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 937   jmpb  (DONE_LABEL);
 938 

 939   if (!UseHeavyMonitors) {
 940     bind  (Stacked);
 941     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 942     lock();
 943     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 944   }
 945 #endif
 946   bind(DONE_LABEL);
 947 
 948   // ZFlag == 1 count in fast path
 949   // ZFlag == 0 count in slow path
 950   jccb(Assembler::notZero, NO_COUNT);
 951 
 952   bind(COUNT);
 953   // Count monitors in fast path
 954 #ifndef _LP64
 955   get_thread(tmpReg);
 956   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 957 #else // _LP64
 958   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 959 #endif
 960 
 961   xorl(tmpReg, tmpReg); // Set ZF == 1
 962 
 963   bind(NO_COUNT);
 964 }
 965 

 394     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 395   }
 396 }
 397 
 398 // Use RTM for inflated locks
 399 // inputs: objReg (object to lock)
 400 //         boxReg (on-stack box address (displaced header location) - KILLED)
 401 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 402 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 403                                             Register scrReg, Register retry_on_busy_count_Reg,
 404                                             Register retry_on_abort_count_Reg,
 405                                             RTMLockingCounters* rtm_counters,
 406                                             Metadata* method_data, bool profile_rtm,
 407                                             Label& DONE_LABEL) {
 408   assert(UseRTMLocking, "why call this otherwise?");
 409   assert(tmpReg == rax, "");
 410   assert(scrReg == rdx, "");
 411   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 412   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 413 

 414   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 415 
 416   if (RTMRetryCount > 0) {
 417     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 418     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 419     bind(L_rtm_retry);
 420   }
 421   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 422     Label L_noincrement;
 423     if (RTMTotalCountIncrRate > 1) {
 424       // tmpReg, scrReg and flags are killed
 425       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 426     }
 427     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 428     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 429     bind(L_noincrement);
 430   }
 431   xbegin(L_on_abort);
 432   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 433   movptr(tmpReg, Address(tmpReg, owner_offset));

 539 //    But beware of excessive branch density on AMD Opterons.
 540 //
 541 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 542 //    or failure of the fast path.  If the fast path fails then we pass
 543 //    control to the slow path, typically in C.  In fast_lock and
 544 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 545 //    will emit a conditional branch immediately after the node.
 546 //    So we have branches to branches and lots of ICC.ZF games.
 547 //    Instead, it might be better to have C2 pass a "FailureLabel"
 548 //    into fast_lock and fast_unlock.  In the case of success, control
 549 //    will drop through the node.  ICC.ZF is undefined at exit.
 550 //    In the case of failure, the node will branch directly to the
 551 //    FailureLabel
 552 
 553 
 554 // obj: object to lock
 555 // box: on-stack box address (displaced header location) - KILLED
 556 // rax,: tmp -- KILLED
 557 // scr: tmp -- KILLED
 558 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 559                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 560                                  RTMLockingCounters* rtm_counters,
 561                                  RTMLockingCounters* stack_rtm_counters,
 562                                  Metadata* method_data,
 563                                  bool use_rtm, bool profile_rtm) {
 564   // Ensure the register assignments are disjoint
 565   assert(tmpReg == rax, "");
 566 
 567   if (use_rtm) {
 568     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 569   } else {
 570     assert(cx2Reg == noreg, "");
 571     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg);
 572   }
 573 
 574   // Possible cases that we'll encounter in fast_lock
 575   // ------------------------------------------------
 576   // * Inflated
 577   //    -- unlocked
 578   //    -- Locked
 579   //       = by self
 580   //       = by other
 581   // * neutral
 582   // * stack-locked
 583   //    -- by self
 584   //       = sp-proximity test hits
 585   //       = sp-proximity test generates false-negative
 586   //    -- by other
 587   //
 588 
 589   Label IsInflated, DONE_LABEL, slow_path, NO_COUNT, COUNT;
 590 
 591   if (DiagnoseSyncOnValueBasedClasses != 0) {
 592     load_klass(tmpReg, objReg, cx1Reg);
 593     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 594     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 595     jcc(Assembler::notZero, DONE_LABEL);
 596   }
 597 
 598 #if INCLUDE_RTM_OPT
 599   if (UseRTMForStackLocks && use_rtm) {
 600     assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
 601     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 602                       stack_rtm_counters, method_data, profile_rtm,
 603                       DONE_LABEL, IsInflated);
 604   }
 605 #endif // INCLUDE_RTM_OPT
 606 
 607   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 608   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 609   jccb(Assembler::notZero, IsInflated);
 610 
 611   if (!UseHeavyMonitors) {
 612     fast_lock_impl(objReg, tmpReg, thread, scrReg, cx1Reg, slow_path);
 613     xorptr(rax, rax); // Set ZF = 1 (success)
 614     jmp(COUNT);














 615   }
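  // Sketch of what the new inline fast path above is assumed to do; the
  // authoritative sequence is in fast_lock_impl, which is not part of this
  // hunk.  Roughly: load the markword, branch to slow_path if the object is
  // already locked (or the lightweight path cannot be used), CAS the lock
  // bits from unlocked (01) to locked (00), and record objReg against the
  // current thread; any failure branches to slow_path.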
 616   bind(slow_path);
 617   // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 618   testptr(objReg, objReg);
 619   jmp(DONE_LABEL);
 620 
 621   bind(IsInflated);
 622   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 623 
 624 #if INCLUDE_RTM_OPT
 625   // Use the same RTM locking code in 32- and 64-bit VM.
 626   if (use_rtm) {
 627     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 628                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 629   } else {
 630 #endif // INCLUDE_RTM_OPT
 631 
 632 #ifndef _LP64
 633   // The object is inflated.
 634 
 635   // boxReg refers to the on-stack BasicLock in the current frame.
 636   // We'd like to write:
 637   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
 638   // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
 639   // additional latency as we have another ST in the store buffer that must drain.
 640 
 641   // avoid ST-before-CAS
 642   // register juggle because we need tmpReg for cmpxchgptr below
 643   movptr(scrReg, boxReg);
 644   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 645 
 646   // Optimistic form: consider XORL tmpReg,tmpReg
 647   movptr(tmpReg, NULL_WORD);
 648 
 649   // Appears unlocked - try to swing _owner from null to non-null.
 650   // Ideally, I'd manifest "Self" with get_thread and then attempt
 651   // to CAS the register containing Self into m->Owner.
 652   // But we don't have enough registers, so instead we can either try to CAS
 653   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 654   // we later store "Self" into m->Owner.  Transiently storing a stack address
 655   // (rsp or the address of the box) into  m->owner is harmless.
 656   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 657   lock();
 658   cmpxchgptr(thread, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 659   // If we weren't able to swing _owner from NULL to the thread

 660   // then take the slow path.
 661   jccb  (Assembler::notZero, NO_COUNT);



 662   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 663 
 664   // If the CAS fails we can either retry or pass control to the slow path.
 665   // We use the latter tactic.
 666   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 667   // If the CAS was successful ...
 668   //   Self has acquired the lock
 669   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 670   // Intentional fall-through into DONE_LABEL ...
 671 #else // _LP64
 672   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 673   movq(scrReg, tmpReg);
 674   xorq(tmpReg, tmpReg);
 675   lock();
 676   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));



 677   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 678   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 679 
 680   cmpptr(thread, rax);                     // Check if we are already the owner (recursive lock)
 681   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 682   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 683   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 684 #endif // _LP64
 685 #if INCLUDE_RTM_OPT
 686   } // use_rtm()
 687 #endif
 688   // DONE_LABEL is a hot target - we'd really like to place it at the
 689   // start of cache line by padding with NOPs.
 690   // See the AMD and Intel software optimization manuals for the
 691   // most efficient "long" NOP encodings.
 692   // Unfortunately none of our alignment mechanisms suffice.
 693   bind(DONE_LABEL);
 694 
 695   // ZFlag == 1 count in fast path
 696   // ZFlag == 0 count in slow path
 697   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 698 
 699   bind(COUNT);
 700   // Count monitors in fast path

 750 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 751   assert(boxReg == rax, "");
 752   assert_different_registers(objReg, boxReg, tmpReg);
 753 
 754   Label DONE_LABEL, Stacked, CheckSucc, COUNT, NO_COUNT;
 755 
 756 #if INCLUDE_RTM_OPT
 757   if (UseRTMForStackLocks && use_rtm) {
 758     assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
 759     Label L_regular_unlock;
 760     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 761     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 762     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 763     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 764     xend();                                                           // otherwise end...
 765     jmp(DONE_LABEL);                                                  // ... and we're done
 766     bind(L_regular_unlock);
 767   }
 768 #endif
 769 
 770   movptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
 771   if (!UseHeavyMonitors) {
 772     testptr(boxReg, markWord::monitor_value);
 773     jcc(Assembler::zero, Stacked);
 774 
 775     // If the owner is ANONYMOUS, we need to fix it - in the slow-path.
 776     Label L;
 777     cmpptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) (intptr_t) ANONYMOUS_OWNER);
 778     jccb(Assembler::notEqual, L);
 779     testptr(objReg, objReg); // Clear ZF to indicate failure at DONE_LABEL.
 780     jmp(DONE_LABEL);
 781     bind(L);
 782   }
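  // Background (an assumption about the new locking scheme, stated as a
  // sketch rather than authoritative behaviour): a monitor inflated while the
  // object was held via the fast-locking path gets ANONYMOUS_OWNER, because
  // the inflating thread cannot name the real owner.  The owner field must be
  // repaired before the monitor can be exited, and that repair is
  // deliberately left to the runtime slow path taken here.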
 783 
 784   // It's inflated.
 785 #if INCLUDE_RTM_OPT
 786   if (use_rtm) {
 787     Label L_regular_inflated_unlock;
 788     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 789     movptr(tmpReg, Address(boxReg, owner_offset));
 790     testptr(tmpReg, tmpReg);
 791     jccb(Assembler::notZero, L_regular_inflated_unlock);
 792     xend();
 793     jmp(DONE_LABEL);
 794     bind(L_regular_inflated_unlock);
 795   }
 796 #endif
 797 
 798   // Despite our balanced locking property we still check that m->_owner == Self
 799   // as java routines or native JNI code called by this thread might
 800   // have released the lock.
 801   // Refer to the comments in synchronizer.cpp for how we might encode extra
 802   // state in _succ so we can avoid fetching EntryList|cxq.
 803   //
 804   // If there's no contention try a 1-0 exit.  That is, exit without
 805   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 806   // we detect and recover from the race that the 1-0 exit admits.
 807   //
 808   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 809   // before it STs null into _owner, releasing the lock.  Updates
 810   // to data protected by the critical section must be visible before
 811   // we drop the lock (and thus before any other thread could acquire
 812   // the lock and observe the fields protected by the lock).
 813   // IA32's memory-model is SPO, so STs are ordered with respect to
 814   // each other and there's no need for an explicit barrier (fence).
 815   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 816 #ifndef _LP64


 817   // Note that we could employ various encoding schemes to reduce
 818   // the number of loads below (currently 4) to just 2 or 3.
 819   // Refer to the comments in synchronizer.cpp.
 820   // In practice the chain of fetches doesn't seem to impact performance, however.
 821   xorptr(tmpReg, tmpReg);
 822   orptr(tmpReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 823   jccb  (Assembler::notZero, DONE_LABEL);
 824   movptr(tmpReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 825   orptr(tmpReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 826   jccb  (Assembler::notZero, DONE_LABEL);
 827   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 828   jmpb  (DONE_LABEL);


















 829 #else // _LP64
 830   // It's inflated
 831   Label LNotRecursive, LSuccess, LGoSlowPath;
 832 
 833   cmpptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 834   jccb(Assembler::equal, LNotRecursive);
 835 
 836   // Recursive inflated unlock
 837   decq(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 838   jmpb(LSuccess);
 839 
 840   bind(LNotRecursive);
 841   movptr(tmpReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 842   orptr(tmpReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 843   jccb  (Assembler::notZero, CheckSucc);
 844   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 845   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 846   jmpb  (DONE_LABEL);
 847 
 848   // Try to avoid passing control into the slow_path ...
 849   bind  (CheckSucc);
 850 
 851   // The following optional optimization can be elided if necessary
 852   // Effectively: if (succ == null) goto slow path
 853   // The code reduces the window for a race, however,
 854   // and thus benefits performance.
 855   cmpptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 856   jccb  (Assembler::zero, LGoSlowPath);
 857 

 858   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 859   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 860 
 861   // Memory barrier/fence
 862   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 863   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 864   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 865   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 866   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 867   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 868   lock(); addl(Address(rsp, 0), 0);
 869 
 870   cmpptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 871   jccb  (Assembler::notZero, LSuccess);
 872 
 873   mov(tmpReg, boxReg);
 874   xorptr(boxReg, boxReg);
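  // boxReg is RAX: the monitor pointer moves into tmpReg so RAX can hold the
  // NULL comparand required by the CMPXCHG below, which only succeeds if
  // _owner is still NULL.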
 875 
 876   // Rare inopportune interleaving - race.
 877   // The successor vanished in the small window above.
 878   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 879   // We need to ensure progress and succession.
 880   // Try to reacquire the lock.
 881   // If that fails then the new owner is responsible for succession and this
 882   // thread needs to take no further action and can exit via the fast path (success).
 883   // If the re-acquire succeeds then pass control into the slow path.
 884   // As implemented, this latter mode is horrible because we generated more
 885   // coherence traffic on the lock *and* artificially extended the critical section
 886   // length by virtue of passing control into the slow path.
 887 
 888   // box is really RAX -- the following CMPXCHG depends on that binding
 889   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 890   lock();
 891   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 892   // There's no successor so we tried to regrab the lock.
 893   // If that didn't work, then another thread grabbed the
 894   // lock so we're done (and exit was a success).
 895   jccb  (Assembler::notEqual, LSuccess);
 896   // Intentional fall-through into slow path
 897 
 898   bind  (LGoSlowPath);
 899   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 900   jmpb  (DONE_LABEL);
 901 
 902   bind  (LSuccess);
 903   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 904   jmpb  (DONE_LABEL);
 905 
 906 #endif
 907   if (!UseHeavyMonitors) {
 908     bind(Stacked);
 909     // Mark-word must be 00 now, try to swing it back to 01 (unlocked)
 910     fast_unlock_impl(objReg, boxReg, tmpReg, DONE_LABEL);
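    // Assumed behaviour of fast_unlock_impl (a sketch; the authoritative code
    // is in its definition, not in this hunk): undo the inline lock taken in
    // fast_lock, restoring the unlocked (01) mark bits, and branch to
    // DONE_LABEL when the inline unlock cannot be performed; on success
    // control falls through to the xorptr below, which sets ZF == 1.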
 911     xorptr(rax, rax); // Set ZF = 1 (success)
 912   }

 913   bind(DONE_LABEL);
 914 
 915   // ZFlag == 1 count in fast path
 916   // ZFlag == 0 count in slow path
 917   jccb(Assembler::notZero, NO_COUNT);
 918 
 919   bind(COUNT);
 920   // Count monitors in fast path
 921 #ifndef _LP64
 922   get_thread(tmpReg);
 923   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 924 #else // _LP64
 925   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 926 #endif
 927 
 928   xorl(tmpReg, tmpReg); // Set ZF == 1
 929 
 930   bind(NO_COUNT);
 931 }
 932 