Skip to content

Commit c5a9b8f

Browse files
authored
Etcd observability (#1884)
A few first changes to help debugging connectivity issues we saw in course of #1879. Note that changing the `msg` key is not a (major) breaking change as watching is done using the `msg` prefix and the port parsing in `matchVersion` is defensively done. The version check is bound to change anyways now (not do it on each message!) --- * [x] CHANGELOG update not needed * [x] Documentation update not needed * [x] Haddocks updated * [x] No new TODOs introduced
2 parents d6afe31 + db2f4b0 commit c5a9b8f

File tree

2 files changed

+47
-35
lines changed

2 files changed

+47
-35
lines changed

hydra-node/src/Hydra/Network/Etcd.hs

+44-32
Original file line numberDiff line numberDiff line change
@@ -15,21 +15,21 @@
1515
-- using a GRPC client. We can only write and read from the cluster while
1616
-- connected to the majority cluster.
1717
--
18-
-- Broadcasting is implemented using @put@ to some well-known key, while
19-
-- message delivery is done by using @watch@ on the same key. We keep a last
20-
-- known revision, also stored on disk, to start 'watch' with that revision (+1)
21-
-- and only deliver messages that were not seen before. In case we are not
22-
-- connected to our 'etcd' instance or not enough peers (= on a minority
23-
-- cluster), we retry sending, but also store messages to broadcast in a
24-
-- 'PersistentQueue', which makes the node resilient against crashes while
25-
-- sending. TODO: Is this needed? performance limitation?
18+
-- Broadcasting is implemented using @put@ to some well-known key, while message
19+
-- delivery is done by using @watch@ on the same 'msg' prefix. We keep a last known
20+
-- revision, also stored on disk, to start 'watch' with that revision (+1) and
21+
-- only deliver messages that were not seen before. In case we are not connected
22+
-- to our 'etcd' instance or not enough peers (= on a minority cluster), we
23+
-- retry sending, but also store messages to broadcast in a 'PersistentQueue',
24+
-- which makes the node resilient against crashes while sending. TODO: Is this
25+
-- needed? performance limitation?
2626
--
2727
-- Connectivity and compatibility with other nodes on the cluster is tracked
2828
-- using the key-value service as well:
2929
--
3030
-- * network connectivity is determined by being able to fetch the member list
3131
-- * peer connectivity is tracked (best effort, not authorized) using an entry
32-
-- at 'alive-\<node id\>' keys with individual leases and repeated keep-alives
32+
-- at 'alive-\<advertise\>' keys with individual leases and repeated keep-alives
3333
-- * each node compare-and-swaps its `version` into a key of the same name to
3434
-- check compatibility (not updatable)
3535
--
@@ -73,7 +73,6 @@ import Hydra.Network (
7373
NetworkCallback (..),
7474
NetworkComponent,
7575
NetworkConfiguration (..),
76-
PortNumber,
7776
hydraVersionedProtocolNumber,
7877
)
7978
import Network.GRPC.Client (
@@ -134,7 +133,7 @@ withEtcdNetwork tracer protocolVersion config callback action = do
134133
race_ (pollConnectivity tracer conn advertise callback) $ do
135134
race_ (waitMessages tracer conn protocolVersion persistenceDir callback) $ do
136135
queue <- newPersistentQueue (persistenceDir </> "pending-broadcast") 100
137-
race_ (broadcastMessages tracer conn protocolVersion (port listen) queue) $ do
136+
race_ (broadcastMessages tracer conn protocolVersion advertise queue) $ do
138137
action
139138
Network
140139
{ broadcast = writePersistentQueue queue
@@ -227,22 +226,23 @@ matchVersion ::
227226
Maybe HydraHandshakeRefused
228227
matchVersion key ourVersion = do
229228
case splitOn "-" $ decodeUtf8 key of
230-
[_prefix, versionText, port] ->
229+
[_prefix, versionText, hostText] -> do
230+
let remoteHost = fromMaybe (Host "???" 0) . readMaybe $ toString hostText
231231
case parseVersion versionText of
232232
Just theirVersion
233233
| ourVersion == theirVersion -> Nothing
234234
| otherwise ->
235235
-- TODO: DRY just cases
236236
Just
237237
HydraHandshakeRefused
238-
{ remoteHost = Host "???" $ fromMaybe 0 $ parsePort port
238+
{ remoteHost
239239
, ourVersion
240240
, theirVersions = KnownHydraVersions [theirVersion]
241241
}
242242
Nothing ->
243243
Just
244244
HydraHandshakeRefused
245-
{ remoteHost = Host "???" $ fromMaybe 0 $ parsePort port
245+
{ remoteHost
246246
, ourVersion
247247
, theirVersions = NoKnownHydraVersions
248248
}
@@ -256,8 +256,6 @@ matchVersion key ourVersion = do
256256
where
257257
parseVersion = fmap MkHydraVersionedProtocolNumber . readMaybe . toString
258258

259-
parsePort = readMaybe . toString
260-
261259
-- | Broadcast messages from a queue to the etcd cluster.
262260
--
263261
-- TODO: retrying on failure even needed?
@@ -267,13 +265,14 @@ broadcastMessages ::
267265
Tracer IO EtcdLog ->
268266
Connection ->
269267
HydraVersionedProtocolNumber ->
270-
PortNumber ->
268+
-- | Used to identify sender.
269+
Host ->
271270
PersistentQueue IO msg ->
272271
IO ()
273-
broadcastMessages tracer conn protocolVersion port queue =
272+
broadcastMessages tracer conn protocolVersion ourHost queue =
274273
withGrpcContext "broadcastMessages" . forever $ do
275274
msg <- peekPersistentQueue queue
276-
(putMessage conn protocolVersion port msg >> popPersistentQueue queue msg)
275+
(putMessage conn protocolVersion ourHost msg >> popPersistentQueue queue msg)
277276
`catch` \case
278277
GrpcException{grpcError, grpcErrorMessage}
279278
| grpcError == GrpcUnavailable || grpcError == GrpcDeadlineExceeded -> do
@@ -286,10 +285,11 @@ putMessage ::
286285
ToCBOR msg =>
287286
Connection ->
288287
HydraVersionedProtocolNumber ->
289-
PortNumber ->
288+
-- | Used to identify sender.
289+
Host ->
290290
msg ->
291291
IO ()
292-
putMessage conn protocolVersion port msg =
292+
putMessage conn protocolVersion ourHost msg =
293293
void $ nonStreaming conn (rpc @(Protobuf KV "put")) req
294294
where
295295
req =
@@ -298,7 +298,7 @@ putMessage conn protocolVersion port msg =
298298
& #value .~ serialize' msg
299299

300300
-- TODO: use one key again (after mapping version check)?
301-
key = encodeUtf8 @Text $ "msg-" <> show (hydraVersionedProtocolNumber protocolVersion) <> "-" <> show port
301+
key = encodeUtf8 @Text $ "msg-" <> show (hydraVersionedProtocolNumber protocolVersion) <> "-" <> show ourHost
302302

303303
-- | Fetch and wait for messages from the etcd cluster.
304304
waitMessages ::
@@ -386,45 +386,53 @@ pollConnectivity tracer conn advertise NetworkCallback{onConnectivity} = do
386386
onConnectivity NetworkConnected
387387
-- Write our alive key using lease
388388
writeAlive leaseId
389+
traceWith tracer CreatedLease{leaseId}
389390
withKeepAlive leaseId $ \keepAlive ->
390391
forever $ do
391392
-- Keep our lease alive
392-
keepAlive
393+
ttlRemaining <- keepAlive
394+
when (ttlRemaining < 1) $
395+
traceWith tracer LowLeaseTTL{ttlRemaining}
393396
-- Determine alive peers
394397
alive <- getAlive
398+
traceWith tracer CurrentlyAlive{alive}
395399
let othersAlive = alive \\ [advertise]
396400
seenAlive <- atomically $ swapTVar seenAliveVar othersAlive
397401
forM_ (othersAlive \\ seenAlive) $ onConnectivity . PeerConnected
398402
forM_ (seenAlive \\ othersAlive) $ onConnectivity . PeerDisconnected
399-
threadDelay 1
403+
-- Wait roughly ttl / 2
404+
threadDelay (ttlRemaining / 2)
400405
where
401-
ttl = 3
402-
403406
onGrpcException seenAliveVar GrpcException{grpcError}
404407
| grpcError == GrpcUnavailable || grpcError == GrpcDeadlineExceeded = do
405408
onConnectivity NetworkDisconnected
406409
atomically $ writeTVar seenAliveVar []
407410
threadDelay 1
408411
onGrpcException _ e = throwIO e
409412

410-
-- REVIEW: server can decide ttl?
411413
createLease = withGrpcContext "createLease" $ do
412414
leaseResponse <-
413415
nonStreaming conn (rpc @(Protobuf Lease "leaseGrant")) $
414-
defMessage & #ttl .~ ttl
416+
defMessage & #ttl .~ 3
415417
pure $ leaseResponse ^. #id
416418

419+
withKeepAlive leaseId action = do
420+
biDiStreaming conn (rpc @(Protobuf Lease "leaseKeepAlive")) $ \send recv -> do
421+
void . action $ do
422+
send $ NextElem $ defMessage & #id .~ leaseId
423+
recv >>= \case
424+
NextElem res -> pure . fromIntegral $ res ^. #ttl
425+
NoNextElem -> do
426+
traceWith tracer NoKeepAliveResponse
427+
pure 0
428+
417429
writeAlive leaseId = withGrpcContext "writeAlive" $ do
418430
void . nonStreaming conn (rpc @(Protobuf KV "put")) $
419431
defMessage
420432
& #key .~ "alive-" <> show advertise
421433
& #value .~ serialize' advertise
422434
& #lease .~ leaseId
423435

424-
withKeepAlive leaseId action = do
425-
biDiStreaming conn (rpc @(Protobuf Lease "leaseKeepAlive")) $ \send _recv ->
426-
void . action $ send $ NextElem (defMessage & #id .~ leaseId)
427-
428436
getAlive = withGrpcContext "getAlive" $ do
429437
res <-
430438
nonStreaming conn (rpc @(Protobuf KV "range")) $
@@ -550,6 +558,10 @@ data EtcdLog
550558
| BroadcastFailed {reason :: Text}
551559
| FailedToDecodeLog {log :: Text, reason :: Text}
552560
| FailedToDecodeValue {key :: Text, value :: Text, reason :: Text}
561+
| CreatedLease {leaseId :: Int64}
562+
| LowLeaseTTL {ttlRemaining :: DiffTime}
563+
| CurrentlyAlive {alive :: [Host]}
564+
| NoKeepAliveResponse
553565
deriving stock (Eq, Show, Generic)
554566
deriving anyclass (ToJSON)
555567

hydra-node/test/Hydra/NetworkSpec.hs

+3-3
Original file line numberDiff line numberDiff line change
@@ -139,17 +139,17 @@ spec = do
139139

140140
it "checks protocol version" $ \tracer -> do
141141
withTempDir "test-etcd" $ \tmp -> do
142-
failAfter 5 $ do
142+
failAfter 10 $ do
143143
PeerConfig2{aliceConfig, bobConfig} <- setup2Peers tmp
144144
let v2 = MkHydraVersionedProtocolNumber 2
145145
withEtcdNetwork @Int tracer v1 aliceConfig noopCallback $ \n1 -> do
146146
(recordReceived, _, waitConnectivity) <- newRecordingCallback
147147
withEtcdNetwork @Int tracer v2 bobConfig recordReceived $ \_n2 -> do
148148
broadcast n1 123
149149

150-
waitEq waitConnectivity 10 $
150+
waitEq waitConnectivity 5 $
151151
HandshakeFailure
152-
{ remoteHost = Host "???" aliceConfig.advertise.port
152+
{ remoteHost = aliceConfig.advertise
153153
, ourVersion = v2
154154
, theirVersions = KnownHydraVersions [v1]
155155
}

0 commit comments

Comments
 (0)