Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

avoid calling libkineto::api().client()->stop twice #1029

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions libkineto/src/CuptiActivityProfiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1138,6 +1138,11 @@ void CuptiActivityProfiler::configure(
currentRunloopState_ = RunloopState::Warmup;
}

// Reports whether a collectTrace() call is currently flagged as in progress.
// The flag is read while holding collectTraceStateMutex_ so the value is
// consistent with writers that update it under the same mutex.
bool CuptiActivityProfiler::getCollectTraceState() {
  std::lock_guard<std::recursive_mutex> stateLock(collectTraceStateMutex_);
  const bool collecting = isCollectingTrace;
  return collecting;
}

void CuptiActivityProfiler::collectTrace(
bool collection_done,
const std::chrono::time_point<std::chrono::system_clock>& now) {
Expand Down Expand Up @@ -1314,13 +1319,17 @@ const time_point<system_clock> CuptiActivityProfiler::performRunLoopStep(
VLOG_IF(1, currentIter >= 0)
<< "This state change was invoked by application's step() call";

// currentIter >= 0 means this is an iteration-based collection,
// triggered by pytorch main thread, it should be executed in another
// currentIter >= 0 means this is called from the step() api of
// the profile in pytorch main thread, it should be executed in another
// thread in case pytorch main thread is blocked
if (currentIter >= 0) {
// if collectTraceThread_ is already running, there's no need to
// execute collectTrace twice.
if (!collectTraceThread_) {
// Do not call collectTrace while profilerThread_ is collecting the trace.
// Otherwise, libkineto::api().client()->stop would be called twice,
// which leads to an unrecoverable c10::Error in disableProfiler.
if (!collectTraceThread_ && !getCollectTraceState()) {
std::lock_guard<std::recursive_mutex> guard(mutex_);
collectTraceThread_ = std::make_unique<std::thread>(
&CuptiActivityProfiler::collectTrace,
Expand All @@ -1330,7 +1339,16 @@ const time_point<system_clock> CuptiActivityProfiler::performRunLoopStep(
}
break;
}
// this is executed in profilerThread_
{
std::lock_guard<std::recursive_mutex> guard(collectTraceStateMutex_);
isCollectingTrace = true;
}
collectTrace(collection_done, now);
{
std::lock_guard<std::recursive_mutex> guard(collectTraceStateMutex_);
isCollectingTrace = false;
}
} else if (derivedConfig_->isProfilingByIteration()) {
// nothing to do here
} else if (
Expand Down
6 changes: 6 additions & 0 deletions libkineto/src/CuptiActivityProfiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,8 @@ class CuptiActivityProfiler {

void checkTimestampOrder(const ITraceActivity* act1);

// Returns the current value of isCollectingTrace, read while holding
// collectTraceStateMutex_.
bool getCollectTraceState();

// On-demand Request Config (should not be modified)
// TODO: remove this config_, dependency needs to be removed from
// finalizeTrace.
Expand Down Expand Up @@ -494,6 +496,10 @@ class CuptiActivityProfiler {
// details.
std::unique_ptr<std::thread> collectTraceThread_{nullptr};

// Guards isCollectingTrace; readers go through getCollectTraceState().
std::recursive_mutex collectTraceStateMutex_;
// True while performRunLoopStep is inside its direct collectTrace() call:
// set immediately before the call and cleared after it returns.
bool isCollectingTrace{false};

// Runloop phase
std::atomic<RunloopState> currentRunloopState_{RunloopState::WaitForRequest};

Expand Down