Skip to content

Commit

Permalink
disable broken devices on Disk Agent registration
Browse files Browse the repository at this point in the history
  • Loading branch information
sharpeye committed Nov 5, 2024
1 parent 3346ef3 commit 32dc4c8
Show file tree
Hide file tree
Showing 28 changed files with 738 additions and 101 deletions.
4 changes: 4 additions & 0 deletions cloud/blockstore/config/disk.proto
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,11 @@ message TDiskAgentConfig

// List of device UUIDs with suspended I/O.
// I/O operations for such a device will result in errors.
// This field is used only for the config cache file.
repeated string DevicesWithSuspendedIO = 35;

// Disable devices that have been recognized as broken by the Disk Registry (DR).
optional bool DisableBrokenDevices = 36;
}

////////////////////////////////////////////////////////////////////////////////
Expand Down
26 changes: 15 additions & 11 deletions cloud/blockstore/libs/rdma_test/rdma_test_environment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,27 @@ namespace NCloud::NBlockStore::NStorage {
TRdmaTestEnvironment::TRdmaTestEnvironment(size_t deviceSize, ui32 poolSize)
: Storage(std::make_shared<TMemoryTestStorage>(deviceSize))
{
THashMap<TString, TStorageAdapterPtr> devices;
devices[Device_1] = std::make_shared<TStorageAdapter>(
Storage,
4_KB, // storageBlockSize
true, // normalize,
TDuration::Zero(), // maxRequestDuration
TDuration::Zero() // shutdownTimeout
);
THashMap<TString, TStorageAdapterPtr> devices{
{Device_1,
std::make_shared<TStorageAdapter>(
Storage,
4_KB, // storageBlockSize
true, // normalize,
TDuration::Zero(), // maxRequestDuration
TDuration::Zero() // shutdownTimeout
)}};

TVector<TString> uuids;
for (const auto& [key, value]: devices) {
uuids.push_back(key);
}
auto deviceClient = std::make_shared<TDeviceClient>(

DeviceClient = std::make_shared<TDeviceClient>(
TDuration::MilliSeconds(100),
uuids,
Logging->CreateLog("BLOCKSTORE_DISK_AGENT"));
deviceClient->AcquireDevices(

DeviceClient->AcquireDevices(
uuids,
ClientId,
TInstant::Now(),
Expand Down Expand Up @@ -60,7 +64,7 @@ TRdmaTestEnvironment::TRdmaTestEnvironment(size_t deviceSize, ui32 poolSize)
std::move(oldRequestCounters),
Logging,
Server,
std::move(deviceClient),
DeviceClient,
std::move(devices));

RdmaTarget->Start();
Expand Down
2 changes: 2 additions & 0 deletions cloud/blockstore/libs/rdma_test/rdma_test_environment.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ struct TRdmaTestEnvironment
"console",
TLogSettings{TLOG_RESOURCES});

std::shared_ptr<TDeviceClient> DeviceClient;

TRdmaTestEnvironment(size_t deviceSize = 4_MB, ui32 poolSize = 1);

virtual ~TRdmaTestEnvironment();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ void TDiskAgentActor::HandleDisableConcreteAgent(
if (record.DeviceUUIDsSize()) {
for (const auto& d: record.GetDeviceUUIDs()) {
State->DisableDevice(d);
State->ReportDisabledDeviceError(d);
}
} else {
HandlePoisonPill(nullptr, ctx);
Expand Down
11 changes: 0 additions & 11 deletions cloud/blockstore/libs/storage/disk_agent/disk_agent_actor_io.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,17 +178,6 @@ void TDiskAgentActor::PerformIO(
started);
};

if (State->IsDeviceDisabled(deviceUUID)) {
LOG_INFO(ctx, TBlockStoreComponents::DISK_AGENT,
"Dropped %s request to device %s, session %s",
TMethod::Name,
deviceUUID.c_str(),
clientId.c_str());
State->ReportDisabledDeviceError(deviceUUID);
replyError(E_IO, "Device disabled");
return;
}

LOG_TRACE(ctx, TBlockStoreComponents::DISK_AGENT,
"%s [%s / %s]",
TMethod::Name,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ void TRegisterActor::HandleRegisterAgentResponse(

auto response = std::make_unique<TEvDiskAgentPrivate::TEvRegisterAgentResponse>(
msg->GetError());
response->DevicesToSuspendIO.assign(
msg->Record.GetDevicesToSuspendIO().cbegin(),
msg->Record.GetDevicesToSuspendIO().cend());
NCloud::Reply(ctx, *RequestInfo, std::move(response));
}

Expand Down Expand Up @@ -144,6 +147,7 @@ void TDiskAgentActor::HandleRegisterAgentResponse(
if (!HasError(msg->GetError())) {
RegistrationState = ERegistrationState::Registered;
LOG_INFO(ctx, TBlockStoreComponents::DISK_AGENT, "Register completed");
State->UpdateDevicesWithSuspendedIO(msg->DevicesToSuspendIO);
} else {
LOG_WARN(ctx, TBlockStoreComponents::DISK_AGENT,
"Register failed: %s. Try later", FormatError(msg->GetError()).c_str());
Expand Down
Loading

0 comments on commit 32dc4c8

Please sign in to comment.