Skip to content

Commit

Permalink
Merge pull request #1581 from glotzerlab/error-message-in-exception
Browse files: browse the repository at this point in the history.
Improve error handling.
  • Loading branch information
joaander authored Jun 26, 2023
2 parents fb325bd + 2e6f64b commit 86e196d
Show file tree
Hide file tree
Showing 14 changed files with 81 additions and 77 deletions.
31 changes: 16 additions & 15 deletions hoomd/CellList.cc
Original file line number Diff line number Diff line change
Expand Up @@ -567,9 +567,10 @@ bool CellList::checkConditions()
ArrayHandle<unsigned int> h_tag(m_pdata->getTags(),
access_location::host,
access_mode::read);
m_exec_conf->msg->errorAllRanks()
<< "Particle with unique tag " << h_tag.data[n] << " has NaN for its position." << endl;
throw runtime_error("Error computing cell list");

ostringstream s;
s << "Particle with unique tag " << h_tag.data[n] << " has NaN for its position." << endl;
throw runtime_error(s.str());
}

// detect particles leaving box errors
Expand All @@ -588,18 +589,18 @@ bool CellList::checkConditions()
Scalar3 lo = m_pdata->getBox().getLo();
Scalar3 hi = m_pdata->getBox().getHi();

m_exec_conf->msg->errorAllRanks()
<< "Particle with unique tag " << h_tag.data[n]
<< " is no longer in the simulation box." << std::endl
<< std::endl
<< "Cartesian coordinates: " << std::endl
<< "x: " << h_pos.data[n].x << " y: " << h_pos.data[n].y << " z: " << h_pos.data[n].z
<< std::endl
<< "Fractional coordinates: " << std::endl
<< "f.x: " << f.x << " f.y: " << f.y << " f.z: " << f.z << std::endl
<< "Local box lo: (" << lo.x << ", " << lo.y << ", " << lo.z << ")" << std::endl
<< " hi: (" << hi.x << ", " << hi.y << ", " << hi.z << ")" << std::endl;
throw runtime_error("Error computing cell list");
ostringstream s;
s << "Particle with unique tag " << h_tag.data[n] << " is no longer in the simulation box."
<< std::endl
<< std::endl
<< "Cartesian coordinates: " << std::endl
<< "x: " << h_pos.data[n].x << " y: " << h_pos.data[n].y << " z: " << h_pos.data[n].z
<< std::endl
<< "Fractional coordinates: " << std::endl
<< "f.x: " << f.x << " f.y: " << f.y << " f.z: " << f.z << std::endl
<< "Local box lo: (" << lo.x << ", " << lo.y << ", " << lo.z << ")" << std::endl
<< " hi: (" << hi.x << ", " << hi.y << ", " << hi.z << ")" << std::endl;
throw runtime_error(s.str());
}

return result;
Expand Down
42 changes: 29 additions & 13 deletions hoomd/ExecutionConfiguration.cc
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,16 @@ ExecutionConfiguration::ExecutionConfiguration(executionMode mode,
{
// if we found a local rank, use that to select the GPU
gpu_id.push_back((local_rank % dev_count));

ostringstream s;
s << "Selected GPU " << gpu_id[0] << " by MPI rank." << endl;
msg->collectiveNoticeStr(4, s.str());
}

if (!gpu_id.size())
{
// auto-detect a single GPU
msg->collectiveNoticeStr(4, "Asking the driver to choose a GPU.\n");
initializeGPU(-1);
}
else
Expand All @@ -136,15 +141,22 @@ ExecutionConfiguration::ExecutionConfiguration(executionMode mode,

setupStats();

s.clear();
s << "Device is running on ";
for (const auto& device_description : m_active_device_descriptions)
{
s << device_description << " ";
}
s << endl;
msg->collectiveNoticeStr(3, s.str());

#if defined(ENABLE_HIP)
if (exec_mode == GPU)
{
if (!m_concurrent && gpu_id.size() > 1)
{
msg->errorAllRanks() << "Multi-GPU execution requested, but not all GPUs support "
"concurrent managed access"
<< endl;
throw runtime_error("Error initializing execution configuration");
throw runtime_error("Multi-GPU execution requested, but not all GPUs support "
"concurrent managed access");
}

#ifndef ALWAYS_USE_MANAGED_MEMORY
Expand Down Expand Up @@ -177,7 +189,7 @@ ExecutionConfiguration::ExecutionConfiguration(executionMode mode,
// select first device by default
hipSetDevice(m_gpu_id[0]);

hipError_t err_sync = hipGetLastError();
hipError_t err_sync = hipPeekAtLastError();
handleHIPError(err_sync, __FILE__, __LINE__);

// initialize cached allocator, max allocation 0.5*global mem
Expand Down Expand Up @@ -278,12 +290,17 @@ void ExecutionConfiguration::handleHIPError(hipError_t err,
if (strlen(file) > strlen(HOOMD_SOURCE_DIR))
file += strlen(HOOMD_SOURCE_DIR);

// print an error message
msg->errorAllRanks() << string(hipGetErrorString(err)) << " before " << file << ":" << line
<< endl;
std::ostringstream s;
#ifdef __HIP_PLATFORM_NVCC__
cudaError_t cuda_error = cudaPeekAtLastError();
s << "CUDA Error: " << string(cudaGetErrorString(cuda_error));
#else
s << "HIP Error: " << string(hipGetErrorString(err));
#endif
s << " before " << file << ":" << line;

// throw an error exception
throw(runtime_error("HIP Error"));
throw(runtime_error(s.str()));
}
}

Expand Down Expand Up @@ -347,7 +364,7 @@ void ExecutionConfiguration::initializeGPU(int gpu_id)
// add to list of active GPUs
m_gpu_id.push_back(hip_gpu_id);

hipError_t err_sync = hipGetLastError();
hipError_t err_sync = hipPeekAtLastError();
handleHIPError(err_sync, __FILE__, __LINE__);
}

Expand Down Expand Up @@ -483,7 +500,6 @@ void ExecutionConfiguration::setupStats()
cudaError_t error = cudaGetDeviceProperties(&cuda_prop, m_gpu_id[idev]);
if (error != cudaSuccess)
{
msg->errorAllRanks() << "" << endl;
throw runtime_error("Failed to get device properties: "
+ string(cudaGetErrorString(error)));
}
Expand Down Expand Up @@ -561,7 +577,7 @@ void ExecutionConfiguration::beginMultiGPU() const

if (isCUDAErrorCheckingEnabled())
{
hipError_t err_sync = hipGetLastError();
hipError_t err_sync = hipPeekAtLastError();
handleHIPError(err_sync, __FILE__, __LINE__);
}
}
Expand Down Expand Up @@ -592,7 +608,7 @@ void ExecutionConfiguration::endMultiGPU() const

if (isCUDAErrorCheckingEnabled())
{
hipError_t err_sync = hipGetLastError();
hipError_t err_sync = hipPeekAtLastError();
handleHIPError(err_sync, __FILE__, __LINE__);
}
}
Expand Down
4 changes: 1 addition & 3 deletions hoomd/ExecutionConfiguration.h
Original file line number Diff line number Diff line change
Expand Up @@ -210,8 +210,6 @@ class PYBIND11_EXPORT ExecutionConfiguration
/// Compute capability of the GPU formatted as a tuple (major, minor)
std::pair<unsigned int, unsigned int> getComputeCapability(unsigned int igpu = 0) const;

//! Handle cuda error message
void handleCUDAError(hipError_t err, const char* file, unsigned int line) const;
//! Handle hip error message
void handleHIPError(hipError_t err, const char* file, unsigned int line) const;
#endif
Expand Down Expand Up @@ -418,7 +416,7 @@ class PYBIND11_EXPORT ExecutionConfiguration
#if defined(ENABLE_HIP)
#define CHECK_CUDA_ERROR() \
{ \
hipError_t err_sync = hipGetLastError(); \
hipError_t err_sync = hipPeekAtLastError(); \
this->m_exec_conf->handleHIPError(err_sync, __FILE__, __LINE__); \
auto gpu_map = this->m_exec_conf->getGPUIds(); \
for (int idev = this->m_exec_conf->getNumActiveGPUs() - 1; idev >= 0; --idev) \
Expand Down
6 changes: 0 additions & 6 deletions hoomd/GPUArray.h
Original file line number Diff line number Diff line change
Expand Up @@ -975,8 +975,6 @@ template<class T> void GPUArray<T>::allocate()
int retval = posix_memalign(&host_ptr, 32, m_num_elements * sizeof(T));
if (retval != 0)
{
if (m_exec_conf)
m_exec_conf->msg->errorAllRanks() << "Error allocating aligned memory" << std::endl;
throw std::bad_alloc();
}

Expand Down Expand Up @@ -1345,8 +1343,6 @@ template<class T> T* GPUArray<T>::resizeHostArray(size_t num_elements)
int retval = posix_memalign((void**)&h_tmp, 32, num_elements * sizeof(T));
if (retval != 0)
{
if (m_exec_conf)
m_exec_conf->msg->errorAllRanks() << "Error allocating aligned memory" << std::endl;
throw std::bad_alloc();
}

Expand Down Expand Up @@ -1411,8 +1407,6 @@ T* GPUArray<T>::resize2DHostArray(size_t pitch, size_t new_pitch, size_t height,
int retval = posix_memalign((void**)&h_tmp, 32, size);
if (retval != 0)
{
if (m_exec_conf)
m_exec_conf->msg->errorAllRanks() << "Error allocating aligned memory" << std::endl;
throw std::bad_alloc();
}

Expand Down
6 changes: 2 additions & 4 deletions hoomd/GPUFlags.h
Original file line number Diff line number Diff line change
Expand Up @@ -226,8 +226,7 @@ template<class T> void GPUFlags<T>::allocate()
int retval = posix_memalign(&ptr, getpagesize(), sizeof(T));
if (retval != 0)
{
m_exec_conf->msg->errorAllRanks() << "Error allocating aligned memory" << std::endl;
throw std::runtime_error("Error allocating GPUArray.");
throw std::runtime_error("Error allocating aligned memory.");
}
h_data = (T*)ptr;
hipHostRegister(h_data, sizeof(T), hipHostRegisterMapped);
Expand All @@ -245,8 +244,7 @@ template<class T> void GPUFlags<T>::allocate()
int retval = posix_memalign(&ptr, getpagesize(), sizeof(T));
if (retval != 0)
{
m_exec_conf->msg->errorAllRanks() << "Error allocating aligned memory" << std::endl;
throw std::runtime_error("Error allocating GPUArray.");
throw std::runtime_error("Error allocating aligned memory.");
}
h_data = (T*)ptr;
hipHostRegister(h_data, sizeof(T), hipHostRegisterDefault);
Expand Down
3 changes: 1 addition & 2 deletions hoomd/hpmc/IntegratorHPMCMono.h
Original file line number Diff line number Diff line change
Expand Up @@ -1638,8 +1638,7 @@ void IntegratorHPMCMono<Shape>::growAABBList(unsigned int N)
int retval = posix_memalign((void**)&m_aabbs, 32, N*sizeof(hoomd::detail::AABB));
if (retval != 0)
{
m_exec_conf->msg->errorAllRanks() << "Error allocating aligned memory" << std::endl;
throw std::runtime_error("Error allocating AABB memory");
throw std::runtime_error("Error allocating aligned memory.");
}
}
}
Expand Down
3 changes: 1 addition & 2 deletions hoomd/md/BondTablePotential.cc
Original file line number Diff line number Diff line change
Expand Up @@ -284,8 +284,7 @@ void BondTablePotential::computeForces(uint64_t timestep)
}
else
{
m_exec_conf->msg->errorAllRanks() << "Table bond out of bounds" << endl;
throw std::runtime_error("Error in bond calculation");
throw std::runtime_error("Table bond out of bounds.");
}
}
}
Expand Down
3 changes: 1 addition & 2 deletions hoomd/md/BondTablePotentialGPU.cc
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,7 @@ void BondTablePotentialGPU::computeForces(uint64_t timestep)

if (h_flags.data[0])
{
m_exec_conf->msg->errorAllRanks() << endl << "Table bond out of bounds" << endl << endl;
throw std::runtime_error("Error in bond calculation");
throw std::runtime_error("Table bond out of bounds.");
}
}
m_tuner->end();
Expand Down
26 changes: 13 additions & 13 deletions hoomd/md/ForceCompositeGPU.cc
Original file line number Diff line number Diff line change
Expand Up @@ -218,10 +218,10 @@ void ForceCompositeGPU::computeForces(uint64_t timestep)

if (flag.x)
{
m_exec_conf->msg->errorAllRanks() << "constrain.rigid(): Composite particle with body tag "
<< flag.x - 1 << " incomplete" << std::endl
<< std::endl;
throw std::runtime_error("Error computing composite particle forces.\n");
std::ostringstream s;
s << "Composite particle with body tag " << flag.x - 1 << " incomplete" << std::endl
<< std::endl;
throw std::runtime_error(s.str());
}

m_tuner_force->end();
Expand Down Expand Up @@ -394,11 +394,11 @@ void ForceCompositeGPU::updateCompositeParticles(uint64_t timestep)
unsigned int body_id = h_body.data[idx];
unsigned int tag = h_tag.data[idx];

m_exec_conf->msg->errorAllRanks()
<< "constrain.rigid(): Particle " << tag << " part of composite body " << body_id
<< " is missing central particle" << std::endl
<< std::endl;
throw std::runtime_error("Error while updating constituent particles");
std::ostringstream s;
s << "Particle " << tag << " part of composite body " << body_id
<< " is missing central particle" << std::endl
<< std::endl;
throw std::runtime_error(s.str());
}

if (flag.y)
Expand All @@ -410,10 +410,10 @@ void ForceCompositeGPU::updateCompositeParticles(uint64_t timestep)
unsigned int idx = flag.y - 1;
unsigned int body_id = h_body.data[idx];

m_exec_conf->msg->errorAllRanks() << "constrain.rigid(): Composite particle with body id "
<< body_id << " incomplete" << std::endl
<< std::endl;
throw std::runtime_error("Error while updating constituent particles");
std::ostringstream s;
s << "Composite particle with body id " << body_id << " incomplete" << std::endl
<< std::endl;
throw std::runtime_error(s.str());
}
}

Expand Down
10 changes: 5 additions & 5 deletions hoomd/md/NeighborListTree.cc
Original file line number Diff line number Diff line change
Expand Up @@ -243,11 +243,11 @@ void NeighborListTree::buildTree()
ArrayHandle<unsigned int> h_tag(m_pdata->getTags(),
access_location::host,
access_mode::read);
m_exec_conf->msg->errorAllRanks()
<< "nlist.tree(): Particle " << h_tag.data[i] << " is out of bounds "
<< "(x: " << my_pos.x << ", y: " << my_pos.y << ", z: " << my_pos.z
<< ", fx: " << f.x << ", fy: " << f.y << ", fz:" << f.z << ")" << endl;
throw runtime_error("Error updating neighborlist");
ostringstream s;
s << "Particle " << h_tag.data[i] << " is out of bounds "
<< "(x: " << my_pos.x << ", y: " << my_pos.y << ", z: " << my_pos.z << ", fx: " << f.x
<< ", fy: " << f.y << ", fz:" << f.z << ")" << endl;
throw runtime_error(s.str());
return;
}

Expand Down
6 changes: 3 additions & 3 deletions hoomd/test/test_global_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ UP_TEST(GlobalArray_transfer_tests)
UP_ASSERT(d_handle.data != NULL);

gpu_fill_test_pattern(d_handle.data, gpu_array.getNumElements());
hipError_t err_sync = hipGetLastError();
hipError_t err_sync = hipPeekAtLastError();
exec_conf->handleHIPError(err_sync, __FILE__, __LINE__);
}

Expand All @@ -127,7 +127,7 @@ UP_TEST(GlobalArray_transfer_tests)
UP_ASSERT(d_handle.data != NULL);

gpu_add_one(d_handle.data, gpu_array.getNumElements());
hipError_t err_sync = hipGetLastError();
hipError_t err_sync = hipPeekAtLastError();
exec_conf->handleHIPError(err_sync, __FILE__, __LINE__);
}

Expand All @@ -146,7 +146,7 @@ UP_TEST(GlobalArray_transfer_tests)
UP_ASSERT(d_handle.data != NULL);

gpu_add_one(d_handle.data, gpu_array.getNumElements());
hipError_t err_sync = hipGetLastError();
hipError_t err_sync = hipPeekAtLastError();
exec_conf->handleHIPError(err_sync, __FILE__, __LINE__);
}

Expand Down
4 changes: 2 additions & 2 deletions hoomd/test/test_global_array.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ hipError_t gpu_add_one(int* d_data, size_t num)
hipLaunchKernelGGL((gpu_add_one_kernel), dim3(grid), dim3(threads), 0, 0, d_data, num);

hipDeviceSynchronize();
return hipGetLastError();
return hipPeekAtLastError();
}

/*! \param d_data Device pointer to the array where the data is held
Expand Down Expand Up @@ -79,7 +79,7 @@ hipError_t gpu_fill_test_pattern(int* d_data, size_t num)
num);

hipDeviceSynchronize();
return hipGetLastError();
return hipPeekAtLastError();
}

} // end namespace test
Expand Down
Loading

0 comments on commit 86e196d

Please sign in to comment.