/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
// There are three kinds of samples done by the profiler.
//
// - A "periodic" sample is the most complex kind. It is done in response to a
//   timer while the profiler is active. It involves writing a stack trace plus
//   a variety of other values (memory measurements, responsiveness
//   measurements, etc.) into the main ProfileBuffer. The sampling is done from
//   off-thread, and so SuspendAndSampleAndResumeThread() is used to get the
//   register values.
//
// - A "synchronous" sample is a simpler kind. It is done in response to an API
//   call (profiler_get_backtrace()). It involves writing a stack trace and
//   little else into a temporary ProfileBuffer, and wrapping that up in a
//   ProfilerBacktrace that can be subsequently used in a marker. The sampling
//   is done on-thread, and so REGISTERS_SYNC_POPULATE() is used to get the
//   register values.
//
// - A "backtrace" sample is the simplest kind. It is done in response to an
//   API call (profiler_suspend_and_sample_thread()). It involves getting a
//   stack trace via a ProfilerStackCollector; it does not write to a
//   ProfileBuffer. The sampling is done from off-thread, and so uses
//   SuspendAndSampleAndResumeThread() to get the register values.
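// For illustration, a synchronous sample is what backs a typical marker stack.
// A hypothetical caller (the marker-payload plumbing is elided):
//
//   UniqueProfilerBacktrace bt = profiler_get_backtrace();  // samples here
//   // ... later, attach `bt` to a marker so its stack appears in the profile.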
// Win32 builds always have frame pointers, so FramePointerStackWalk() always
// works.
#if defined(GP_PLAT_x86_windows)
#  define HAVE_NATIVE_UNWIND
#  define USE_FRAME_POINTER_STACK_WALK
#endif
// Win64 builds always omit frame pointers, so we use the slower
// MozStackWalk(), which works in that case.
#if defined(GP_PLAT_amd64_windows)
#  define HAVE_NATIVE_UNWIND
#  define USE_MOZ_STACK_WALK
#endif
// AArch64 Win64 doesn't seem to use frame pointers, so we use the slower
// MozStackWalk().
#if defined(GP_PLAT_arm64_windows)
#  define HAVE_NATIVE_UNWIND
#  define USE_MOZ_STACK_WALK
#endif
// Mac builds use FramePointerStackWalk(). Even if we build without
// frame pointers, we'll still get useful stacks in system libraries
// because those always have frame pointers.
// We don't use MozStackWalk() on Mac.
#if defined(GP_OS_darwin)
#  define HAVE_NATIVE_UNWIND
#  define USE_FRAME_POINTER_STACK_WALK
#endif
// No stack-walking in baseprofiler on linux, android, bsd.
// APIs now make it easier to capture backtraces from the Base Profiler, which
// is currently not supported on these platforms, and would lead to a MOZ_CRASH
// in REGISTERS_SYNC_POPULATE(). `#if 0` added in bug 1658232, follow-up bugs
// should be referenced in meta bug 1557568.
#if 0
// Android builds use the ARM Exception Handling ABI to unwind.
#  if defined(GP_PLAT_arm_linux) || defined(GP_PLAT_arm_android)
#    define HAVE_NATIVE_UNWIND
#    define USE_EHABI_STACKWALK
#    include "EHABIStackWalk.h"
#  endif
// Linux/BSD builds use LUL, which uses DWARF info to unwind stacks.
#  if defined(GP_PLAT_amd64_linux) || defined(GP_PLAT_x86_linux) ||       \
      defined(GP_PLAT_amd64_android) || defined(GP_PLAT_x86_android) ||   \
      defined(GP_PLAT_mips64_linux) || defined(GP_PLAT_arm64_linux) ||    \
      defined(GP_PLAT_arm64_android) || defined(GP_PLAT_amd64_freebsd) || \
      defined(GP_PLAT_arm64_freebsd)
#    define HAVE_NATIVE_UNWIND
#    define USE_LUL_STACKWALK
#    include "lul/LulMain.h"
#    include "lul/platform-linux-lul.h"
// On linux we use LUL for periodic samples and synchronous samples, but we use
// FramePointerStackWalk for backtrace samples when MOZ_PROFILING is enabled.
// (See the comment at the top of the file for a definition of
// periodic/synchronous/backtrace.)
//
// FramePointerStackWalk can produce incomplete stacks when the current entry
// is in a shared library without frame pointers; however, LUL can take a long
// time to initialize, which is undesirable for consumers of
// profiler_suspend_and_sample_thread like the Background Hang Reporter.
#    if defined(MOZ_PROFILING)
#      define USE_FRAME_POINTER_STACK_WALK
#    endif
#  endif
#endif
// We can only stackwalk without expensive initialization on platforms which
// support FramePointerStackWalk or MozStackWalk. LUL stackwalking requires
// initializing LUL, and EHABIStackWalk requires initializing EHABI, both of
// which can be expensive.
#if defined(USE_FRAME_POINTER_STACK_WALK) || defined(USE_MOZ_STACK_WALK)
#  define HAVE_FASTINIT_NATIVE_UNWIND
#endif
ProfileChunkedBuffer& profiler_get_core_buffer() {
  // This needs its own mutex, because it is used concurrently from functions
  // guarded by gPSMutex as well as others without safety (e.g.,
  // profiler_add_marker). It is *not* used inside the critical section of the
  // sampler, because mutexes cannot be used there.
  static ProfileChunkedBuffer sProfileChunkedBuffer{
      ProfileChunkedBuffer::ThreadSafety::WithMutex};
  return sProfileChunkedBuffer;
}
  // Add all the possible features.
  BASE_PROFILER_FOR_EACH_FEATURE(ADD_FEATURE)
#undef ADD_FEATURE

  // Now remove features not supported on this platform/configuration.
  ProfilerFeature::ClearJava(features);
  ProfilerFeature::ClearJS(features);
  ProfilerFeature::ClearScreenshots(features);
#if !defined(HAVE_NATIVE_UNWIND)
  ProfilerFeature::ClearStackWalk(features);
#endif
#if !defined(GP_OS_windows)
  ProfilerFeature::ClearNoTimerResolutionChange(features);
#endif

  return features;
}
// Default features common to all contexts (even if not available).
static constexpr uint32_t DefaultFeatures() {
  return ProfilerFeature::Java | ProfilerFeature::JS |
         ProfilerFeature::StackWalk | ProfilerFeature::CPUUtilization |
         ProfilerFeature::ProcessCPU;
}
// Extra default features when MOZ_PROFILER_STARTUP is set (even if not
// available).
static constexpr uint32_t StartupExtraDefaultFeatures() {
  // Enable mainthreadio by default for startup profiles as startup is heavy on
  // I/O operations, and main thread I/O is really important to see there.
  return ProfilerFeature::MainThreadIO | ProfilerFeature::IPCMessages;
}
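// For illustration: a startup profile would typically request
// `DefaultFeatures() | StartupExtraDefaultFeatures()`. Requesting a feature
// that is unavailable here is harmless, because ActivePS::AdjustFeatures()
// (below) masks the request with AvailableFeatures() before use.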
// The auto-lock/unlock mutex that guards accesses to CorePS and ActivePS.
// Use `PSAutoLock lock;` to take the lock until the end of the enclosing
// block. External profilers may use this same lock for their own data, but as
// the lock is non-recursive, *only* `f(PSLockRef, ...)` functions below should
// be called, to avoid double-locking.
class MOZ_RAII PSAutoLock {
 public:
  PSAutoLock() : mLock(gPSMutex) {}
// All functions in this file can run on multiple threads unless they have an
// NS_IsMainThread() assertion.
// This class contains the profiler's core global state, i.e. that which is
// valid even when the profiler is not active. Most profile operations can't do
// anything useful when this class is not instantiated, so we release-assert
// its non-nullness in all such operations.
//
// Accesses to CorePS are guarded by gPSMutex. Getters and setters take a
// PSAutoLock reference as an argument as proof that the gPSMutex is currently
// locked. This makes it clear when gPSMutex is locked and helps avoid
// accidental unlocked accesses to global state. There are ways to circumvent
// this mechanism, but please don't do so without *very* good reason and a
// detailed explanation.
//
// The exceptions to this rule:
//
// - mProcessStartTime, because it's immutable;
//
// - each thread's RacyRegisteredThread object is accessible without locking
//   via TLSRegisteredThread::RacyRegisteredThread().
class CorePS {
 private:
  CorePS()
      : mProcessStartTime(TimeStamp::ProcessCreation())
#ifdef USE_LUL_STACKWALK
        ,
        mLul(nullptr)
#endif
  {
  }
  // Unlike ActivePS::Exists(), CorePS::Exists() can be called without gPSMutex
  // being locked. This is because CorePS is instantiated so early on the main
  // thread that we don't have to worry about it being racy.
  static bool Exists() { return !!sInstance; }
for (auto& registeredThread : sInstance->mRegisteredThreads) {
aProfSize += registeredThread->SizeOfIncludingThis(aMallocSizeOf);
}
for (auto& registeredPage : sInstance->mRegisteredPages) {
aProfSize += registeredPage->SizeOfIncludingThis(aMallocSizeOf);
}
    // Measurement of the following things may be added later if DMD finds it
    // is worthwhile:
    // - CorePS::mRegisteredThreads itself (its elements' children are
    //   measured above)
    // - CorePS::mRegisteredPages itself (its elements' children are
    //   measured above)
    // - CorePS::mInterposeObserver
#if defined(USE_LUL_STACKWALK)
    if (sInstance->mLul) {
      aLulSize += sInstance->mLul->SizeOfIncludingThis(aMallocSizeOf);
    }
#endif
}
// No PSLockRef is needed for this field because it's immutable.
PS_GET_LOCKLESS(const TimeStamp&, ProcessStartTime)
auto foundPageIter = std::find_if(
sInstance->mRegisteredPages.begin(), sInstance->mRegisteredPages.end(),
RegisteredPageComparator{aRegisteredPage.get()});
    if (foundPageIter != sInstance->mRegisteredPages.end()) {
      if ((*foundPageIter)->Url() == "about:blank") {
        // When a BrowsingContext is loaded, the first url loaded in it will be
        // about:blank, and if the principal matches, the first document loaded
        // in it will share an inner window. That's why we should delete the
        // intermittent about:blank if they share the inner window.
        sInstance->mRegisteredPages.erase(foundPageIter);
      } else {
        // Do not register the same page again.
        return;
      }
    }
MOZ_RELEASE_ASSERT(
sInstance->mRegisteredPages.append(std::move(aRegisteredPage)));
}
  static void RemoveRegisteredPage(PSLockRef,
                                   uint64_t aRegisteredInnerWindowID) {
    MOZ_ASSERT(sInstance);
    // Remove RegisteredPage from mRegisteredPages by given inner window ID.
    sInstance->mRegisteredPages.eraseIf([&](const RefPtr<PageInformation>& rd) {
      return rd->InnerWindowID() == aRegisteredInnerWindowID;
    });
  }
  static void AppendCounter(PSLockRef, BaseProfilerCount* aCounter) {
    MOZ_ASSERT(sInstance);
    // We don't own the counter; they may be stored in static objects.
    MOZ_RELEASE_ASSERT(sInstance->mCounters.append(aCounter));
  }
  static void RemoveCounter(PSLockRef, BaseProfilerCount* aCounter) {
    // We may be called to remove a counter after the profiler is stopped or
    // late in shutdown.
    if (sInstance) {
      auto* counter = std::find(sInstance->mCounters.begin(),
                                sInstance->mCounters.end(), aCounter);
      MOZ_RELEASE_ASSERT(counter != sInstance->mCounters.end());
      sInstance->mCounters.erase(counter);
    }
  }
 private:
  // The singleton instance
  static CorePS* sInstance;

  // The time that the process started.
  const TimeStamp mProcessStartTime;

  // Info on all the registered threads.
  // ThreadIds in mRegisteredThreads are unique.
Vector<UniquePtr<RegisteredThread>> mRegisteredThreads;
  // Info on all the registered pages.
  // InnerWindowIDs in mRegisteredPages are unique.
Vector<RefPtr<PageInformation>> mRegisteredPages;
// Non-owning pointers to all active counters
Vector<BaseProfilerCount*> mCounters;
#ifdef USE_LUL_STACKWALK
  // LUL's state. Null prior to the first activation, non-null thereafter.
  UniquePtr<lul::LUL> mLul;
#endif
// Process name, provided by child process initialization code.
  std::string mProcessName;
  // Private name, provided by child process initialization code (eTLD+1 in
  // fission)
std::string mETLDplus1;
};
// The buffer size is provided as a number of "entries"; this is their size in
// bytes.
constexpr static uint32_t scBytesPerEntry = 8;
// This class contains the profiler's global state that is valid only when the
// profiler is active. When not instantiated, the profiler is inactive.
//
// Accesses to ActivePS are guarded by gPSMutex, in much the same fashion as
// CorePS.
//
class ActivePS {
 private:
  constexpr static uint32_t ChunkSizeForEntries(uint32_t aEntries) {
    return uint32_t(std::min(size_t(ClampToAllowedEntries(aEntries)) *
                                 scBytesPerEntry / scMinimumNumberOfChunks,
                             size_t(scMaximumChunkSize)));
  }
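  // Worked example, assuming a hypothetical scMinimumNumberOfChunks of 4 and a
  // large-enough scMaximumChunkSize: a capacity of 128 * 1024 entries (1 MiB
  // at 8 bytes per entry) yields 256 KiB chunks.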
  static uint32_t AdjustFeatures(uint32_t aFeatures, uint32_t aFilterCount) {
    // Filter out any features unavailable in this platform/configuration.
    aFeatures &= AvailableFeatures();

    // Some features imply others.
    if (aFeatures & ProfilerFeature::FileIOAll) {
      aFeatures |= ProfilerFeature::MainThreadIO | ProfilerFeature::FileIO;
    } else if (aFeatures & ProfilerFeature::FileIO) {
      aFeatures |= ProfilerFeature::MainThreadIO;
    }

    return aFeatures;
  }
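  // E.g. a request for only ProfilerFeature::FileIOAll comes out with
  // ProfilerFeature::FileIO and ProfilerFeature::MainThreadIO also set (after
  // the AvailableFeatures() mask has been applied).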
  ActivePS(PSLockRef aLock, const TimeStamp& aProfilingStartTime,
           PowerOfTwo32 aCapacity, double aInterval, uint32_t aFeatures,
           const char** aFilters, uint32_t aFilterCount,
           const Maybe<double>& aDuration)
      : mProfilingStartTime(aProfilingStartTime),
        mGeneration(sNextGeneration++),
        mCapacity(aCapacity),
        mDuration(aDuration),
        mInterval(aInterval),
        mFeatures(AdjustFeatures(aFeatures, aFilterCount)),
        mProfileBufferChunkManager(
            MakeUnique<ProfileBufferChunkManagerWithLocalLimit>(
                size_t(ClampToAllowedEntries(aCapacity.Value())) *
                    scBytesPerEntry,
                ChunkSizeForEntries(aCapacity.Value()))),
        mProfileBuffer([this]() -> ProfileChunkedBuffer& {
          ProfileChunkedBuffer& buffer = profiler_get_core_buffer();
          buffer.SetChunkManager(*mProfileBufferChunkManager);
          return buffer;
        }()),
        // The new sampler thread doesn't start sampling immediately because
        // the main loop within Run() is blocked until this function's caller
        // unlocks gPSMutex.
        mSamplerThread(
            NewSamplerThread(aLock, mGeneration, aInterval, aFeatures)),
        mIsPaused(false),
        mIsSamplingPaused(false) {
    // Deep copy and lower-case aFilters.
    MOZ_ALWAYS_TRUE(mFilters.resize(aFilterCount));
    MOZ_ALWAYS_TRUE(mFiltersLowered.resize(aFilterCount));
    for (uint32_t i = 0; i < aFilterCount; ++i) {
      mFilters[i] = aFilters[i];
      mFiltersLowered[i].reserve(mFilters[i].size());
      std::transform(mFilters[i].cbegin(), mFilters[i].cend(),
                     std::back_inserter(mFiltersLowered[i]), ::tolower);
    }
  }
  ~ActivePS() {
    if (mProfileBufferChunkManager) {
      // We still control the chunk manager; remove it from the core buffer.
      profiler_get_core_buffer().ResetChunkManager();
    }
  }
  bool ThreadSelected(const char* aThreadName) {
    if (mFiltersLowered.empty()) {
      return true;
    }

    std::string name = aThreadName;
    std::transform(name.begin(), name.end(), name.begin(), ::tolower);

    for (const auto& filter : mFiltersLowered) {
      if (filter == "*") {
        return true;
      }

      // Crude, non-UTF-8-compatible, case-insensitive substring search.
      if (name.find(filter) != std::string::npos) {
        return true;
      }

      // If the filter is "pid:<my pid>", profile all threads.
      if (mozilla::profiler::detail::FilterHasPid(filter.c_str())) {
        return true;
      }
    }
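  // For example, with filters {"GeckoMain", "compositor"}, a thread named
  // "Compositor" is selected by the case-insensitive substring match; a filter
  // of "*", or of "pid:" followed by this process' pid, selects every thread.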
n += sInstance->mProfileBuffer.SizeOfExcludingThis(aMallocSizeOf);
    // Measurement of the following members may be added later if DMD finds it
    // is worthwhile:
    // - mLiveProfiledThreads (both the array itself, and the contents)
    // - mDeadProfiledThreads (both the array itself, and the contents)
  // Returns an array containing (RegisteredThread*, ProfiledThreadData*) pairs
  // for all threads that should be included in a profile, both for threads
  // that are still registered, and for threads that have been unregistered but
  // still have data in the buffer.
  // For threads that have already been unregistered, the RegisteredThread
  // pointer will be null.
  // The returned array is sorted by thread register time.
  // Do not hold on to the return value across thread registration or profiler
  // restarts.
  static Vector<std::pair<RegisteredThread*, ProfiledThreadData*>>
  ProfiledThreads(PSLockRef) {
    MOZ_ASSERT(sInstance);
    Vector<std::pair<RegisteredThread*, ProfiledThreadData*>> array;
    MOZ_RELEASE_ASSERT(
        array.initCapacity(sInstance->mLiveProfiledThreads.length() +
                           sInstance->mDeadProfiledThreads.length()));
    for (auto& t : sInstance->mLiveProfiledThreads) {
      MOZ_RELEASE_ASSERT(array.append(
          std::make_pair(t.mRegisteredThread, t.mProfiledThreadData.get())));
    }
    for (auto& t : sInstance->mDeadProfiledThreads) {
      MOZ_RELEASE_ASSERT(
          array.append(std::make_pair((RegisteredThread*)nullptr, t.get())));
    }

    std::sort(array.begin(), array.end(),
              [](const std::pair<RegisteredThread*, ProfiledThreadData*>& a,
                 const std::pair<RegisteredThread*, ProfiledThreadData*>& b) {
                return a.second->Info()->RegisterTime() <
                       b.second->Info()->RegisterTime();
              });
    return array;
  }
  static Vector<RefPtr<PageInformation>> ProfiledPages(PSLockRef aLock) {
    MOZ_ASSERT(sInstance);
    Vector<RefPtr<PageInformation>> array;
    for (auto& d : CorePS::RegisteredPages(aLock)) {
      MOZ_RELEASE_ASSERT(array.append(d));
    }
    for (auto& d : sInstance->mDeadProfiledPages) {
      MOZ_RELEASE_ASSERT(array.append(d));
    }
    // We don't need to sort the pages like threads since we won't show them
    // as a list.
    return array;
  }
  // Do a linear search through mLiveProfiledThreads to find the
  // ProfiledThreadData object for a RegisteredThread.
  static ProfiledThreadData* GetProfiledThreadData(
      PSLockRef, RegisteredThread* aRegisteredThread) {
    MOZ_ASSERT(sInstance);
    for (const LiveProfiledThreadData& thread :
         sInstance->mLiveProfiledThreads) {
      if (thread.mRegisteredThread == aRegisteredThread) {
        return thread.mProfiledThreadData.get();
      }
    }
    return nullptr;
  }
    // Find the right entry in the mLiveProfiledThreads array and remove the
    // element, moving the ProfiledThreadData object for the thread into the
    // mDeadProfiledThreads array.
    // The thread's RegisteredThread object gets destroyed here.
    for (size_t i = 0; i < sInstance->mLiveProfiledThreads.length(); i++) {
      LiveProfiledThreadData& thread = sInstance->mLiveProfiledThreads[i];
      if (thread.mRegisteredThread == aRegisteredThread) {
        thread.mProfiledThreadData->NotifyUnregistered(
            sInstance->mProfileBuffer.BufferRangeEnd());
        MOZ_RELEASE_ASSERT(sInstance->mDeadProfiledThreads.append(
            std::move(thread.mProfiledThreadData)));
        sInstance->mLiveProfiledThreads.erase(
            &sInstance->mLiveProfiledThreads[i]);
        return;
      }
    }
  }
PS_GET_AND_SET(bool, IsPaused)
  // True if sampling is paused (through generic `SetIsPaused()` or specific
  // `SetIsSamplingPaused()`).
  static bool IsSamplingPaused(PSLockRef lock) {
    MOZ_ASSERT(sInstance);
    return IsPaused(lock) || sInstance->mIsSamplingPaused;
  }
 private:
  // The singleton instance.
  static ActivePS* sInstance;
const TimeStamp mProfilingStartTime;
  // We need to track activity generations. If we didn't, we could have the
  // following scenario.
  //
  // - profiler_stop() locks gPSMutex, de-instantiates ActivePS, unlocks
  //   gPSMutex, deletes the SamplerThread (which does a join).
  //
  // - profiler_start() runs on a different thread, locks gPSMutex,
  //   re-instantiates ActivePS, unlocks gPSMutex -- all before the join
  //   completes.
  //
  // - SamplerThread::Run() locks gPSMutex, sees that ActivePS is instantiated,
  //   and continues as if the start/stop pair didn't occur. Also
  //   profiler_stop() is stuck, unable to finish.
  //
  // By checking ActivePS *and* the generation, we can avoid this scenario.
  // sNextGeneration is used to track the next generation number; it is static
  // because it must persist across different ActivePS instantiations.
  const uint32_t mGeneration;
  static uint32_t sNextGeneration;
  // The maximum number of 8-byte entries in mProfileBuffer.
  const PowerOfTwo32 mCapacity;

  // The maximum duration of entries in mProfileBuffer, in seconds.
  const Maybe<double> mDuration;

  // The interval between samples, measured in milliseconds.
  const double mInterval;

  // The profile features that are enabled.
  const uint32_t mFeatures;
// Substrings of names of threads we want to profile.
Vector<std::string> mFilters;
Vector<std::string> mFiltersLowered;
  // The chunk manager used by `mProfileBuffer` below.
  // May become null if it gets transferred to the Gecko Profiler.
UniquePtr<ProfileBufferChunkManagerWithLocalLimit> mProfileBufferChunkManager;
// The buffer into which all samples are recorded.
ProfileBuffer mProfileBuffer;
  // ProfiledThreadData objects for any threads that were profiled at any point
  // during this run of the profiler:
  // - mLiveProfiledThreads contains all threads that are still registered, and
  // - mDeadProfiledThreads contains all threads that have already been
  //   unregistered but for which there is still data in the profile buffer.
Vector<LiveProfiledThreadData> mLiveProfiledThreads;
Vector<UniquePtr<ProfiledThreadData>> mDeadProfiledThreads;
  // Info on all the dead pages.
  // Registered pages are moved to this array after unregistration.
  // We keep them in case we need them in the profile data, and remove them
  // once we are sure we won't need them anymore.
Vector<RefPtr<PageInformation>> mDeadProfiledPages;
  // The current sampler thread. This class is not responsible for destroying
  // the SamplerThread object; the Destroy() method returns it so the caller
  // can destroy it.
SamplerThread* const mSamplerThread;
  // Is the profiler fully paused?
  bool mIsPaused;

  // Is the profiler periodic sampling paused?
  bool mIsSamplingPaused;
/* static */ bool RacyFeatures::IsActiveWithFeature(uint32_t aFeature) {
  uint32_t af = sActiveAndFeatures;  // copy it first
  return (af & Active) && (af & aFeature);
}

/* static */ bool RacyFeatures::IsActiveWithoutFeature(uint32_t aFeature) {
  uint32_t af = sActiveAndFeatures;  // copy it first
  return (af & Active) && !(af & aFeature);
}

/* static */ bool RacyFeatures::IsActiveAndUnpaused() {
  uint32_t af = sActiveAndFeatures;  // copy it first
  return (af & Active) && !(af & Paused);
}

/* static */ bool RacyFeatures::IsActiveAndSamplingUnpaused() {
  uint32_t af = sActiveAndFeatures;  // copy it first
  return (af & Active) && !(af & (Paused | SamplingPaused));
}
// Each live thread has a RegisteredThread, and we store a reference to it in
// TLS. This class encapsulates that TLS.
class TLSRegisteredThread {
 public:
  static bool Init(PSLockRef) {
    bool ok1 = sRegisteredThread.init();
    bool ok2 = AutoProfilerLabel::sProfilingStack.init();
    return ok1 && ok2;
  }

  // Get the entire RegisteredThread. Accesses are guarded by gPSMutex.
  static class RegisteredThread* RegisteredThread(PSLockRef) {
    return sRegisteredThread.get();
  }

  // Get only the RacyRegisteredThread. Accesses are not guarded by gPSMutex.
  static class RacyRegisteredThread* RacyRegisteredThread() {
    class RegisteredThread* registeredThread = sRegisteredThread.get();
    return registeredThread ? &registeredThread->RacyRegisteredThread()
                            : nullptr;
  }

  // Get only the ProfilingStack. Accesses are not guarded by gPSMutex.
  // RacyRegisteredThread() can also be used to get the ProfilingStack, but
  // that is marginally slower because it requires an extra pointer
  // indirection.
  static ProfilingStack* Stack() {
    return AutoProfilerLabel::sProfilingStack.get();
  }

 private:
  // This is a non-owning reference to the RegisteredThread;
  // CorePS::mRegisteredThreads is the owning reference. On thread
  // deregistration, this reference is cleared and the RegisteredThread is
  // destroyed.
  static MOZ_THREAD_LOCAL(class RegisteredThread*) sRegisteredThread;
};
// Although you can access a thread's ProfilingStack via
// TLSRegisteredThread::sRegisteredThread, we also have a second TLS pointer
// directly to the ProfilingStack. Here's why.
//
// - We need to be able to push to and pop from the ProfilingStack in
//   AutoProfilerLabel.
//
// - The class functions are hot and must be defined in BaseProfiler.h so they
//   can be inlined.
//
// - We don't want to expose TLSRegisteredThread (and RegisteredThread) in
//   BaseProfiler.h.
//
// This second pointer isn't ideal, but does provide a way to satisfy those
// constraints. TLSRegisteredThread is responsible for updating it.
MOZ_THREAD_LOCAL(ProfilingStack*) AutoProfilerLabel::sProfilingStack;
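// For illustration, this cached ProfilingStack* slot is what keeps label
// frames cheap on the hot path. A hypothetical caller:
//
//   {
//     AUTO_BASE_PROFILER_LABEL("MyFunction", OTHER);
//     // ... work attributed to "MyFunction" ...
//   }  // Popped on scope exit, touching only the TLS ProfilingStack.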
// The name of the main thread.
static const char* const kMainThreadName = "GeckoMain";
////////////////////////////////////////////////////////////////////////
// BEGIN sampling/unwinding code
// Additional registers that have to be saved when a thread is paused.
#if defined(GP_PLAT_x86_linux) || defined(GP_PLAT_x86_android) || \
    defined(GP_ARCH_x86)
#  define UNWINDING_REGS_HAVE_ECX_EDX
#elif defined(GP_PLAT_amd64_linux) || defined(GP_PLAT_amd64_android) || \
    defined(GP_PLAT_amd64_freebsd) || defined(GP_ARCH_amd64) ||         \
    defined(__x86_64__)
#  define UNWINDING_REGS_HAVE_R10_R12
#elif defined(GP_PLAT_arm_linux) || defined(GP_PLAT_arm_android)
#  define UNWINDING_REGS_HAVE_LR_R7
#elif defined(GP_PLAT_arm64_linux) || defined(GP_PLAT_arm64_android) || \
    defined(GP_PLAT_arm64_freebsd) || defined(GP_ARCH_arm64) ||         \
    defined(__aarch64__)
#  define UNWINDING_REGS_HAVE_LR_R11
#endif
// The registers used for stack unwinding and a few other sampling purposes.
// The ctor does nothing; users are responsible for filling in the fields.
class Registers {
 public:
  Registers()
      : mPC{nullptr},
        mSP{nullptr},
        mFP{nullptr}
#if defined(UNWINDING_REGS_HAVE_ECX_EDX)
        ,
        mEcx{nullptr},
        mEdx{nullptr}
#elif defined(UNWINDING_REGS_HAVE_R10_R12)
        ,
        mR10{nullptr},
        mR12{nullptr}
#elif defined(UNWINDING_REGS_HAVE_LR_R7)
        ,
        mLR{nullptr},
        mR7{nullptr}
#elif defined(UNWINDING_REGS_HAVE_LR_R11)
        ,
        mLR{nullptr},
        mR11{nullptr}
#endif
  {
  }
void Clear() { memset(this, 0, sizeof(*this)); }
  // These fields are filled in by
  // Sampler::SuspendAndSampleAndResumeThread() for periodic and backtrace
  // samples, and by REGISTERS_SYNC_POPULATE for synchronous samples.
  Address mPC;  // Instruction pointer.
  Address mSP;  // Stack pointer.
  Address mFP;  // Frame pointer.
#if defined(UNWINDING_REGS_HAVE_ECX_EDX)
  Address mEcx;  // Temp for return address.
  Address mEdx;  // Temp for frame pointer.
#elif defined(UNWINDING_REGS_HAVE_R10_R12)
  Address mR10;  // Temp for return address.
  Address mR12;  // Temp for frame pointer.
#elif defined(UNWINDING_REGS_HAVE_LR_R7)
  Address mLR;  // ARM link register, or temp for return address.
  Address mR7;  // Temp for frame pointer.
#elif defined(UNWINDING_REGS_HAVE_LR_R11)
  Address mLR;   // ARM link register, or temp for return address.
  Address mR11;  // Temp for frame pointer.
#endif

#if defined(GP_OS_linux) || defined(GP_OS_android) || defined(GP_OS_freebsd)
  // This contains all the registers, which means it duplicates the four fields
  // above. This is ok.
  ucontext_t* mContext;  // The context from the signal handler.
#endif
};
// Setting MAX_NATIVE_FRAMES too high risks the unwinder wasting a lot of time
// looping on corrupted stacks.
static const size_t MAX_NATIVE_FRAMES = 1024;

struct NativeStack {
  void* mPCs[MAX_NATIVE_FRAMES];
  void* mSPs[MAX_NATIVE_FRAMES];
  size_t mCount;  // Number of frames filled.

  NativeStack() : mPCs(), mSPs(), mCount(0) {}
};
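// Note: a NativeStack is sizeable; on a 64-bit target the two arrays alone
// occupy 1024 frames * 2 * 8 bytes = 16 KiB.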
// Merges the profiling stack and native stack, outputting the details to
// aCollector.
static void MergeStacks(bool aIsSynchronous,
                        const RegisteredThread& aRegisteredThread,
                        const NativeStack& aNativeStack,
                        ProfilerStackCollector& aCollector) {
  // WARNING: this function runs within the profiler's "critical section".
  // WARNING: this function might be called while the profiler is inactive, and
  //          cannot rely on ActivePS.

  Maybe<uint64_t> samplePosInBuffer;
  if (!aIsSynchronous) {
    // aCollector.SamplePositionInBuffer() will return Nothing() when
    // profiler_suspend_and_sample_thread is called from the background hang
    // reporter.
    samplePosInBuffer = aCollector.SamplePositionInBuffer();
  }

  // While the profiling stack array is ordered oldest-to-youngest, the JS and
  // native arrays are ordered youngest-to-oldest. We must add frames to
  // aCollector oldest-to-youngest. Thus, iterate over the profiling stack
  // forwards, and the JS and native arrays backwards. Note: this means the
  // terminating condition for jsIndex and nativeIndex is being < 0.
uint32_t profilingStackIndex = 0;
int32_t nativeIndex = aNativeStack.mCount - 1;
uint8_t* lastLabelFrameStackAddr = nullptr;
  // Iterate as long as there is at least one frame remaining.
  while (profilingStackIndex != profilingStackFrameCount || nativeIndex >= 0) {
    // There are 1 to 3 frames available. Find and add the oldest.
    uint8_t* profilingStackAddr = nullptr;
    uint8_t* nativeStackAddr = nullptr;

    if (profilingStackIndex != profilingStackFrameCount) {
      const ProfilingStackFrame& profilingStackFrame =
          profilingStackFrames[profilingStackIndex];

      if (profilingStackFrame.isLabelFrame() ||
          profilingStackFrame.isSpMarkerFrame()) {
        lastLabelFrameStackAddr = (uint8_t*)profilingStackFrame.stackAddress();
      }

      // Skip any JS_OSR frames. Such frames are used when the JS interpreter
      // enters a jit frame on a loop edge (via on-stack-replacement, or OSR).
      // To avoid both the profiling stack frame and jit frame being recorded
      // (and showing up twice), the interpreter marks the interpreter
      // profiling stack frame as JS_OSR to ensure that it doesn't get counted.
      if (profilingStackFrame.isOSRFrame()) {
        profilingStackIndex++;
        continue;
      }
    if (nativeIndex >= 0) {
      nativeStackAddr = (uint8_t*)aNativeStack.mSPs[nativeIndex];
    }

    // If there's a native stack frame which has the same SP as a profiling
    // stack frame, pretend we didn't see the native stack frame. Ditto for a
    // native stack frame which has the same SP as a JS stack frame. In effect
    // this means profiling stack frames or JS frames trump conflicting native
    // frames.
    if (nativeStackAddr && (profilingStackAddr == nativeStackAddr)) {
      nativeStackAddr = nullptr;
      nativeIndex--;
      MOZ_ASSERT(profilingStackAddr);
    }

    // Check to see if the profiling stack frame is top-most.
    if (profilingStackAddr > nativeStackAddr) {
      MOZ_ASSERT(profilingStackIndex < profilingStackFrameCount);
      const ProfilingStackFrame& profilingStackFrame =
          profilingStackFrames[profilingStackIndex];

      // Sp marker frames are just annotations and should not be recorded in
      // the profile.
      if (!profilingStackFrame.isSpMarkerFrame()) {
        if (aIsSynchronous && profilingStackFrame.categoryPair() ==
                                  ProfilingCategoryPair::PROFILER) {
          // For stacks captured synchronously (i.e. marker stacks), stop
          // walking the stack as soon as we enter the profiler category, to
          // avoid showing profiler internal code in marker stacks.
          return;
        }
        aCollector.CollectProfilingStackFrame(profilingStackFrame);
      }
      profilingStackIndex++;
      continue;
    }

    // If we reach here, there must be a native stack frame and it must be the
    // greatest frame.
    if (nativeStackAddr) {
      MOZ_ASSERT(nativeIndex >= 0);
      void* addr = (void*)aNativeStack.mPCs[nativeIndex];
      aCollector.CollectNativeLeafAddr(addr);
    }
    if (nativeIndex >= 0) {
      nativeIndex--;
    }
  }
}
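// For illustration of the merge: stacks grow downward, so a higher stack
// address means an older frame. Given profiling frames [A@0x100, B@0x80]
// (oldest first) and native SPs stored youngest-to-oldest as [0x70, 0x90]
// (iteration starts from the oldest, 0x90), the loop above emits A
// (0x100 > 0x90), then the native frame at 0x90, then B, then the native
// frame at 0x70 -- i.e. strictly oldest-to-youngest.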
#if defined(USE_FRAME_POINTER_STACK_WALK)
static void DoFramePointerBacktrace(PSLockRef aLock,
                                    const RegisteredThread& aRegisteredThread,
                                    const Registers& aRegs,
                                    NativeStack& aNativeStack) {
  // WARNING: this function runs within the profiler's "critical section".
  // WARNING: this function might be called while the profiler is inactive, and
  //          cannot rely on ActivePS.

  // Start with the current function. We use 0 as the frame number here because
  // the FramePointerStackWalk() call below will use 1..N. This is a bit weird
  // but it doesn't matter because StackWalkCallback() doesn't use the frame
  // number argument.
  StackWalkCallback(/* frameNum */ 0, aRegs.mPC, aRegs.mSP, &aNativeStack);
#if defined(USE_MOZ_STACK_WALK)
static void DoMozStackWalkBacktrace(PSLockRef aLock,
                                    const RegisteredThread& aRegisteredThread,
                                    const Registers& aRegs,
                                    NativeStack& aNativeStack) {
  // WARNING: this function runs within the profiler's "critical section".
  // WARNING: this function might be called while the profiler is inactive, and
  //          cannot rely on ActivePS.

  // Start with the current function. We use 0 as the frame number here because
  // the MozStackWalkThread() call below will use 1..N. This is a bit weird but
  // it doesn't matter because StackWalkCallback() doesn't use the frame number
  // argument.
  StackWalkCallback(/* frameNum */ 0, aRegs.mPC, aRegs.mSP, &aNativeStack);
#ifdef USE_EHABI_STACKWALK
static void DoEHABIBacktrace(PSLockRef aLock,
                             const RegisteredThread& aRegisteredThread,
                             const Registers& aRegs,
                             NativeStack& aNativeStack) {
  // WARNING: this function runs within the profiler's "critical section".
  // WARNING: this function might be called while the profiler is inactive, and
  //          cannot rely on ActivePS.
// See the comment at the callsite for why this function is necessary.
#  if defined(MOZ_HAVE_ASAN_IGNORE)
MOZ_ASAN_IGNORE static void ASAN_memcpy(void* aDst, const void* aSrc,
                                        size_t aLen) {
  // The obvious thing to do here is call memcpy(). However, although
  // ASAN_memcpy() is not instrumented by ASAN, memcpy() still is, and the
  // false positive still manifests! So we must implement memcpy() ourselves
  // within this function.
  char* dst = static_cast<char*>(aDst);
  const char* src = static_cast<const char*>(aSrc);

  for (size_t i = 0; i < aLen; i++) {
    dst[i] = src[i];
  }
}
#  endif
static void DoLULBacktrace(PSLockRef aLock,
                           const RegisteredThread& aRegisteredThread,
                           const Registers& aRegs, NativeStack& aNativeStack) {
  // WARNING: this function runs within the profiler's "critical section".
  // WARNING: this function might be called while the profiler is inactive, and
  //          cannot rely on ActivePS.
const mcontext_t* mc = &aRegs.mContext->uc_mcontext;
  // Copy up to N_STACK_BYTES from rsp-REDZONE upwards, but not going past the
  // stack's registered top point. Do some basic sanity checks too. This
  // assumes that the TaggedUWord holding the stack pointer value is valid, but
  // it should be, since it was constructed that way in the code just above.

  // We could construct |stackImg| so that LUL reads directly from the stack in
  // question, rather than from a copy of it. That would reduce overhead and
  // space use a bit. However, it gives a problem with dynamic analysis tools
  // (ASan, TSan, Valgrind) which is that such tools will report invalid or
  // racing memory accesses, and such accesses will be reported deep inside
  // LUL. By taking a copy here, we can either sanitise the copy (for Valgrind)
  // or copy it using an unchecked memcpy (for ASan, TSan). That way we don't
  // have to try and suppress errors inside LUL.
  //
  // N_STACK_BYTES is set to 160KB. This is big enough to hold all stacks
  // observed in some minutes of testing, whilst keeping the size of this
  // function (DoNativeBacktrace)'s frame reasonable. Most stacks observed in
  // practice are small, 4KB or less, and so the copy costs are insignificant
  // compared to other profiler overhead.
  //
  // |stackImg| is allocated on this (the sampling thread's) stack. That
  // implies that the frame for this function is at least N_STACK_BYTES large.
  // In general it would be considered unacceptable to have such a large frame
  // on a stack, but it only exists for the unwinder thread, and so is not
  // expected to be a problem. Allocating it on the heap is troublesome because
  // this function runs whilst the sampled thread is suspended, so any heap
  // allocation risks deadlock. Allocating it as a global variable is not
  // thread safe, which would be a problem if we ever allow multiple sampler
  // threads. Hence allocating it on the stack seems to be the least-worst
  // option.
  lul::StackImage stackImg;

  {
#  if defined(GP_PLAT_amd64_linux) || defined(GP_PLAT_amd64_android) || \
      defined(GP_PLAT_amd64_freebsd)
    uintptr_t rEDZONE_SIZE = 128;
    uintptr_t start = startRegs.xsp.Value() - rEDZONE_SIZE;
#  elif defined(GP_PLAT_arm_linux) || defined(GP_PLAT_arm_android)
    uintptr_t rEDZONE_SIZE = 0;
    uintptr_t start = startRegs.r13.Value() - rEDZONE_SIZE;
#  elif defined(GP_PLAT_arm64_linux) || defined(GP_PLAT_arm64_android) || \
      defined(GP_PLAT_arm64_freebsd)
    uintptr_t rEDZONE_SIZE = 0;
    uintptr_t start = startRegs.sp.Value() - rEDZONE_SIZE;
#  elif defined(GP_PLAT_x86_linux) || defined(GP_PLAT_x86_android)
    uintptr_t rEDZONE_SIZE = 0;
    uintptr_t start = startRegs.xsp.Value() - rEDZONE_SIZE;
#  elif defined(GP_PLAT_mips64_linux)
    uintptr_t rEDZONE_SIZE = 0;
    uintptr_t start = startRegs.sp.Value() - rEDZONE_SIZE;
#  else
#    error "Unknown plat"
#  endif
    uintptr_t end = reinterpret_cast<uintptr_t>(aRegisteredThread.StackTop());
    uintptr_t ws = sizeof(void*);
    start &= ~(ws - 1);
    end &= ~(ws - 1);
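    // E.g. with ws == 8, the two maskings above round each address down to an
    // 8-byte boundary: 0x7ffe12345679 & ~uintptr_t(7) == 0x7ffe12345678.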
    uintptr_t nToCopy = 0;
    if (start < end) {
      nToCopy = end - start;
      if (nToCopy > lul::N_STACK_BYTES) nToCopy = lul::N_STACK_BYTES;
    }
    MOZ_ASSERT(nToCopy <= lul::N_STACK_BYTES);
    stackImg.mLen = nToCopy;
    stackImg.mStartAvma = start;
    if (nToCopy > 0) {
      // If this is a vanilla memcpy(), ASAN makes the following complaint:
      //
      //   ERROR: AddressSanitizer: stack-buffer-underflow ...
      //   ...
      //   HINT: this may be a false positive if your program uses some custom
      //   stack unwind mechanism or swapcontext
      //
      // This code is very much a custom stack unwind mechanism! So we use an
      // alternative memcpy() implementation that is ignored by ASAN.
#  if defined(MOZ_HAVE_ASAN_IGNORE)
      ASAN_memcpy(&stackImg.mContents[0], (void*)start, nToCopy);
#  else
      memcpy(&stackImg.mContents[0], (void*)start, nToCopy);
#  endif
      (void)VALGRIND_MAKE_MEM_DEFINED(&stackImg.mContents[0], nToCopy);
    }
  }
  // Update stats in the LUL stats object. Unfortunately this requires
  // three global memory operations.
lul->mStats.mContext += 1;
lul->mStats.mCFI += aNativeStack.mCount - 1 - framePointerFramesAcquired;
lul->mStats.mFP += framePointerFramesAcquired;
}
#endif
#ifdef HAVE_NATIVE_UNWIND
static void DoNativeBacktrace(PSLockRef aLock,
                              const RegisteredThread& aRegisteredThread,
                              const Registers& aRegs,
                              NativeStack& aNativeStack) {
  // This method determines which stackwalker is used for periodic and
  // synchronous samples. (Backtrace samples are treated differently, see
  // profiler_suspend_and_sample_thread() for details). The only part of the
  // ordering that matters is that LUL must precede FRAME_POINTER, because on
  // Linux they can both be present.
#  if defined(USE_LUL_STACKWALK)
  DoLULBacktrace(aLock, aRegisteredThread, aRegs, aNativeStack);
#  elif defined(USE_EHABI_STACKWALK)
  DoEHABIBacktrace(aLock, aRegisteredThread, aRegs, aNativeStack);
#  elif defined(USE_FRAME_POINTER_STACK_WALK)
  DoFramePointerBacktrace(aLock, aRegisteredThread, aRegs, aNativeStack);
#  elif defined(USE_MOZ_STACK_WALK)
  DoMozStackWalkBacktrace(aLock, aRegisteredThread, aRegs, aNativeStack);
#  else
#    error "Invalid configuration"
#  endif
}
#endif
// Writes some components shared by periodic and synchronous profiles to
// ActivePS's ProfileBuffer. (This should only be called from DoSyncSample()
// and DoPeriodicSample().)
//
// The grammar for entry sequences is in a comment above
// ProfileBuffer::StreamSamplesToJSON.
static inline void DoSharedSample(
    PSLockRef aLock, bool aIsSynchronous, RegisteredThread& aRegisteredThread,
    const Registers& aRegs, uint64_t aSamplePos, uint64_t aBufferRangeStart,
    ProfileBuffer& aBuffer,
    StackCaptureOptions aCaptureOptions = StackCaptureOptions::Full) {
  // WARNING: this function runs within the profiler's "critical section".

  MOZ_ASSERT(!aBuffer.IsThreadSafe(),
             "Mutexes cannot be used inside this critical section");

    // We can't walk the whole native stack, but we can record the top frame.
    if (aCaptureOptions == StackCaptureOptions::Full) {
      aBuffer.AddEntry(ProfileBufferEntry::NativeLeafAddr((void*)aRegs.mPC));
    }
  }
}
// Writes the components of a synchronous sample to the given ProfileBuffer.
static void DoSyncSample(PSLockRef aLock, RegisteredThread& aRegisteredThread,
                         const TimeStamp& aNow, const Registers& aRegs,
                         ProfileBuffer& aBuffer,
                         StackCaptureOptions aCaptureOptions) {
  // WARNING: this function runs within the profiler's "critical section".

  MOZ_ASSERT(aCaptureOptions != StackCaptureOptions::NoStack,
             "DoSyncSample should not be called when no capture is needed");
// Writes the components of a periodic sample to ActivePS's ProfileBuffer.
// The ThreadId entry is already written in the main ProfileBuffer; its
// location is `aSamplePos`. We can write the rest to `aBuffer` (which may be
// different).
static void DoPeriodicSample(PSLockRef aLock,
                             RegisteredThread& aRegisteredThread,
                             ProfiledThreadData& aProfiledThreadData,
                             const Registers& aRegs, uint64_t aSamplePos,
                             uint64_t aBufferRangeStart,
                             ProfileBuffer& aBuffer) {
  // WARNING: this function runs within the profiler's "critical section".