Side load snapshot #1864 (Open)

wants to merge 10 commits into base: master
6 changes: 5 additions & 1 deletion CHANGELOG.md
@@ -54,9 +54,13 @@ ETCD_AUTO_COMPACTION_RETENTION=168h
> [!NOTE]
> Only variables prefixed with `ETCD_` are passed on to the `etcd` process.


- Add a list of [clients](https://hydra.family/head-protocol/unstable/docs/clients) to the docs

- Add API command (`POST /snapshot`) which allows adopting a given snapshot as the latest confirmed one (see the sketch below).
* Add new `SideLoadSnapshot` client input.
* Add new `SnapshotSideLoaded` state changed event.
* Add new `SnapshotSideLoaded` server output.
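
  For illustration (not part of this diff), a minimal sketch of exercising the new client input, assuming the `hydra-cluster` test helpers also added in this PR (`getSnapshotConfirmed`, `send`, `input`) and a `HydraClient` handle obtained from `withHydraNode`:

  ```haskell
  {-# LANGUAGE OverloadedStrings #-}

  import Data.Aeson ((.=))
  import HydraNode (HydraClient, getSnapshotConfirmed, input, send)

  -- Sketch: fetch the latest confirmed snapshot (GET /snapshot) and
  -- side-load it back, which prunes the node's local pending txs.
  sideLoadLatest :: HydraClient -> IO ()
  sideLoadLatest n = do
    snapshotConfirmed <- getSnapshotConfirmed n
    send n $ input "SideLoadSnapshot" ["snapshot" .= snapshotConfirmed]
  ```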

## [0.20.0] - 2025-02-04

- **BETA** hydra-node now supports incremental commits in beta mode. We would like to test out this feature
Expand Down
16 changes: 16 additions & 0 deletions docs/docs/how-to/operating-hydra.md
@@ -70,3 +70,19 @@ hydra_head_tx_confirmation_time_ms_count 0

* Confirm peers are properly connected to each other. Verify the `--peer` arguments point to the correct `host:port` for each peer. The `PeerConnected` message should be observed by the client or appear in the logs and be consistent across all peers involved in a head.
* Ensure the _Hydra signing key_ for your node or the _Hydra verification keys_ for peers match each node's expectations. Verify that `AckSn` messages are received by all parties and that the `LogicOutcome` log contains no errors.

### Head Stuck: Peer Out of Sync

Processing transactions in a Hydra head requires each node to agree on them, which happens when the nodes sign a snapshot during the `AckSn` phase. Each node validates transactions (upon the `NewTx` command) against its local view of the ledger state, using the provided `--ledger-protocol-parameters`. Since transaction validity depends on configuration (and, to some extent, on the exact build version of hydra-node), one node may accept a transaction while its peers reject it.

When this happens, the accepting node's local state diverges from the rest, potentially leading to attempts to spend outputs that other nodes do not recognize as available.

This issue can also arise if a peer goes offline while a transaction is submitted, potentially causing the Hydra head to become stuck and preventing further snapshots from being signed ([see test case](https://github.com/cardano-scaling/hydra/pull/1780)).

As a result of this divergence, the local ledger states of the Hydra nodes effectively fork, leaving the network inconsistent. While it is technically still possible to submit transactions to the Hydra nodes, doing so is ineffective because snapshots do not advance unless all nodes sign them: each node accepts transactions against a different state, so the nodes disagree on which UTxOs have been spent.

To recover from this issue, we introduced side-loading of snapshots to synchronize the local ledger state of the Hydra nodes. With this mechanism, every peer reverts to the latest confirmed snapshot. This can be done using the `POST /snapshot/latest` endpoint, which clears the peer's local pending transactions and restores its state to match the last agreed snapshot, allowing the node to rejoin the consensus with the rest of the network.
Contributor: In the changelog you mention `POST /snapshot`, but here the route is different: `POST /snapshot/latest`.

Contributor: I see... there are two routes, actually.


Newer confirmed snapshots can also be adopted if all party members use the `POST /snapshot` endpoint with the same `ConfirmedSnapshot` as the JSON body.

It is important to note that this recovery process is a coordinated effort among peers to ensure the consistency and availability of the Hydra head.
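
For illustration (not part of this diff), a minimal sketch of driving both routes over HTTP from Haskell using `http-conduit`; the API address `127.0.0.1:4001` is an assumption, and the snapshot body is taken verbatim from a prior `GET /snapshot` response:

```haskell
{-# LANGUAGE OverloadedStrings #-}

import Network.HTTP.Simple
  ( getResponseBody,
    getResponseStatusCode,
    httpLBS,
    parseRequest_,
    setRequestBodyLBS,
    setRequestHeader,
  )

main :: IO ()
main = do
  -- Revert this node to its latest confirmed snapshot (no request body).
  resLatest <- httpLBS (parseRequest_ "POST http://127.0.0.1:4001/snapshot/latest")
  print (getResponseStatusCode resLatest)

  -- Alternatively, adopt a specific confirmed snapshot: fetch it from one
  -- node and POST the same JSON body to every peer.
  snapshot <- getResponseBody <$> httpLBS (parseRequest_ "GET http://127.0.0.1:4001/snapshot")
  resAdopt <-
    httpLBS
      . setRequestHeader "Content-Type" ["application/json"]
      . setRequestBodyLBS snapshot
      $ parseRequest_ "POST http://127.0.0.1:4001/snapshot"
  print (getResponseStatusCode resAdopt)
```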
113 changes: 111 additions & 2 deletions hydra-cluster/src/Hydra/Cluster/Scenarios.hs
@@ -26,10 +26,10 @@ import CardanoClient (
)
import CardanoNode (NodeLog)
import Control.Concurrent.Async (mapConcurrently_)
import Control.Lens ((.~), (^.), (^..), (^?))
import Control.Lens ((.~), (?~), (^.), (^..), (^?))
import Data.Aeson (Value, object, (.=))
import Data.Aeson qualified as Aeson
import Data.Aeson.Lens (key, values, _JSON, _String)
import Data.Aeson.Lens (atKey, key, values, _JSON, _String)
import Data.Aeson.Types (parseMaybe)
import Data.ByteString (isInfixOf)
import Data.ByteString qualified as B
@@ -104,10 +104,12 @@ import Hydra.Tx.Utils (dummyValidatorScript, verificationKeyToOnChainId)
import HydraNode (
HydraClient (..),
HydraNodeLog,
getSnapshotConfirmed,
getSnapshotUTxO,
input,
output,
postDecommit,
prepareHydraNode,
requestCommitTx,
send,
waitFor,
@@ -116,6 +118,7 @@ import HydraNode (
waitMatch,
withHydraCluster,
withHydraNode,
withPreparedHydraNode,
)
import Network.HTTP.Conduit (parseUrlThrow)
import Network.HTTP.Conduit qualified as L
@@ -1305,6 +1308,112 @@ canDecommit tracer workDir node hydraScriptsTxId =

RunningNode{networkId, nodeSocket, blockTime} = node

-- | Can side-load a snapshot and resume agreement after a peer comes back online with a healthy configuration
canSideLoadSnapshot :: Tracer IO EndToEndLog -> FilePath -> RunningNode -> [TxId] -> IO ()
canSideLoadSnapshot tracer workDir cardanoNode hydraScriptsTxId = do
let clients = [Alice, Bob, Carol]
[(aliceCardanoVk, aliceCardanoSk), (bobCardanoVk, _), (carolCardanoVk, _)] <- forM clients keysFor
seedFromFaucet_ cardanoNode aliceCardanoVk 100_000_000 (contramap FromFaucet tracer)
seedFromFaucet_ cardanoNode bobCardanoVk 100_000_000 (contramap FromFaucet tracer)
seedFromFaucet_ cardanoNode carolCardanoVk 100_000_000 (contramap FromFaucet tracer)

let contestationPeriod = UnsafeContestationPeriod 1
let depositDeadline = UnsafeDepositDeadline 200
aliceChainConfig <-
chainConfigFor Alice workDir nodeSocket hydraScriptsTxId [Bob, Carol] contestationPeriod depositDeadline
<&> setNetworkId networkId
bobChainConfig <-
chainConfigFor Bob workDir nodeSocket hydraScriptsTxId [Alice, Carol] contestationPeriod depositDeadline
<&> setNetworkId networkId
carolChainConfig <-
chainConfigFor Carol workDir nodeSocket hydraScriptsTxId [Alice, Bob] contestationPeriod depositDeadline
<&> setNetworkId networkId

withHydraNode hydraTracer aliceChainConfig workDir 1 aliceSk [bobVk, carolVk] [1, 2, 3] $ \n1 -> do
aliceUTxO <- seedFromFaucet cardanoNode aliceCardanoVk 1_000_000 (contramap FromFaucet tracer)
withHydraNode hydraTracer bobChainConfig workDir 2 bobSk [aliceVk, carolVk] [1, 2, 3] $ \n2 -> do
-- Carol starts her node misconfigured
let pparamsDecorator = atKey "maxTxSize" ?~ toJSON (Aeson.Number 0)
wrongOptions <- prepareHydraNode carolChainConfig workDir 3 carolSk [aliceVk, bobVk] [1, 2, 3] pparamsDecorator
tx <- withPreparedHydraNode hydraTracer workDir 3 wrongOptions $ \n3 -> do
-- Init
send n1 $ input "Init" []
headId <- waitForAllMatch (10 * blockTime) [n1, n2, n3] $ headIsInitializingWith (Set.fromList [alice, bob, carol])

-- Alice commits something
requestCommitTx n1 aliceUTxO >>= submitTx cardanoNode

-- Everyone else commits nothing
mapConcurrently_ (\n -> requestCommitTx n mempty >>= submitTx cardanoNode) [n2, n3]

-- Observe open with the relevant UTxOs
waitFor hydraTracer (20 * blockTime) [n1, n2, n3] $
output "HeadIsOpen" ["utxo" .= toJSON aliceUTxO, "headId" .= headId]

-- Alice submits a new transaction
utxo <- getSnapshotUTxO n1
tx <- mkTransferTx testNetworkId utxo aliceCardanoSk aliceCardanoVk
send n1 $ input "NewTx" ["transaction" .= tx]

-- Alice and Bob accept it
waitForAllMatch (200 * blockTime) [n1, n2] $ \v -> do
guard $ v ^? key "tag" == Just "TxValid"
guard $ v ^? key "transactionId" == Just (toJSON $ txId tx)

-- Carol does not, because her node is misconfigured
waitMatch 3 n3 $ \v -> do
guard $ v ^? key "tag" == Just "TxInvalid"
guard $ v ^? key "transaction" . key "txId" == Just (toJSON $ txId tx)

pure tx

-- Carol disconnects and the others observe it
waitForAllMatch (100 * blockTime) [n1, n2] $ \v -> do
guard $ v ^? key "tag" == Just "PeerDisconnected"

-- Carol reconnects with healthy reconfigured node
withHydraNode hydraTracer carolChainConfig workDir 3 carolSk [aliceVk, bobVk] [1, 2, 3] $ \n3 -> do
-- Carol re-submits the same transaction
send n3 $ input "NewTx" ["transaction" .= tx]
-- Carol accepts it
waitMatch 3 n3 $ \v -> do
guard $ v ^? key "tag" == Just "TxValid"
guard $ v ^? key "transactionId" == Just (toJSON $ txId tx)
-- But now Alice and Bob do not, because they already applied it
waitForAllMatch (200 * blockTime) [n1, n2] $ \v -> do
guard $ v ^? key "tag" == Just "TxInvalid"
guard $ v ^? key "transaction" . key "txId" == Just (toJSON $ txId tx)

-- Note: At this point the head is stuck: no further SnapshotConfirmed
-- (including one for the above tx) will be signed by everyone.

-- Each party side-loads the latest confirmed snapshot (which is the initial one).
-- This also prunes local txs and discards any in-flight signing round.
snapshotConfirmed <- getSnapshotConfirmed n1
flip mapConcurrently_ [n1, n2, n3] $ \n -> do
send n $ input "SideLoadSnapshot" ["snapshot" .= snapshotConfirmed]
waitMatch (200 * blockTime) n $ \v -> do
guard $ v ^? key "tag" == Just "SnapshotSideLoaded"
guard $ v ^? key "confirmedSnapshot" == Just (toJSON snapshotConfirmed)

-- Carol re-submits the same transaction (but anyone can at this point)
send n3 $ input "NewTx" ["transaction" .= tx]

-- Everyone confirms it
-- Note: We can't use `waitForAllMatch` here as it expects them to
-- emit the exact same datatype; but Carol will be behind in sequence
-- numbers as she was offline.
flip mapConcurrently_ [n1, n2, n3] $ \n ->
waitMatch (200 * blockTime) n $ \v -> do
guard $ v ^? key "tag" == Just "SnapshotConfirmed"
guard $ v ^? key "snapshot" . key "number" == Just (toJSON (1 :: Integer))
-- Just check that everyone signed it.
let sigs = v ^.. key "signatures" . key "multiSignature" . values
guard $ length sigs == 3
where
RunningNode{nodeSocket, networkId, blockTime} = cardanoNode
hydraTracer = contramap FromHydraNode tracer

-- * L2 scenarios

-- | Finds UTxO owned by given key in the head and creates transactions
141 changes: 141 additions & 0 deletions hydra-cluster/src/HydraNode.hs
@@ -25,6 +25,7 @@ import Hydra.Logging (Tracer, Verbosity (..), traceWith)
import Hydra.Network (Host (Host), NodeId (NodeId))
import Hydra.Network qualified as Network
import Hydra.Options (ChainConfig (..), DirectChainConfig (..), LedgerConfig (..), RunOptions (..), defaultDirectChainConfig, toArgs)
import Hydra.Tx (ConfirmedSnapshot)
import Hydra.Tx.ContestationPeriod (ContestationPeriod)
import Hydra.Tx.Crypto (HydraKey)
import Hydra.Tx.DepositDeadline (DepositDeadline)
@@ -215,6 +216,21 @@ getSnapshotUTxO HydraClient{apiHost = Host{hostname, port}} =
>>= httpJSON
<&> getResponseBody

-- | Get the latest confirmed snapshot from the hydra-node. NOTE: While we
-- usually avoid parsing responses using the same data types as the system
-- under test, here we parse the response as a 'ConfirmedSnapshot' since we
-- often need to pick it apart.
getSnapshotConfirmed :: HydraClient -> IO (ConfirmedSnapshot Tx)
getSnapshotConfirmed HydraClient{apiHost = Host{hostname, port}} =
runReq defaultHttpConfig request <&> responseBody
where
request =
Req.req
GET
(Req.http hostname /: "snapshot")
NoReqBody
(Proxy :: Proxy (JsonResponse (ConfirmedSnapshot Tx)))
(Req.port (fromInteger . toInteger $ port))

getMetrics :: HasCallStack => HydraClient -> IO ByteString
getMetrics HydraClient{hydraNodeId, apiHost = Host{hostname}} = do
failAfter 3 $
@@ -296,6 +312,131 @@ withHydraCluster tracer workDir nodeSocket firstNodeId allKeys hydraKeys hydraSc

-- * Start / connect to a hydra-node

-- | Prepare protocol-parameters to run a hydra-node with given 'ChainConfig' and using the config from
-- config/.
preparePParams ::
ChainConfig ->
FilePath ->
(Aeson.Value -> Aeson.Value) ->
IO FilePath
preparePParams chainConfig stateDir paramsDecorator = do
let cardanoLedgerProtocolParametersFile = stateDir </> "protocol-parameters.json"
case chainConfig of
Offline _ ->
readConfigFile "protocol-parameters.json"
>>= writeFileBS cardanoLedgerProtocolParametersFile
Direct DirectChainConfig{nodeSocket, networkId} -> do
-- NOTE: This implicitly tests the compatibility of cardano-cli with hydra-node
protocolParameters <- cliQueryProtocolParameters nodeSocket networkId
Aeson.encodeFile cardanoLedgerProtocolParametersFile $
paramsDecorator protocolParameters
& atKey "txFeeFixed" ?~ toJSON (Number 0)
& atKey "txFeePerByte" ?~ toJSON (Number 0)
& key "executionUnitPrices" . atKey "priceMemory" ?~ toJSON (Number 0)
& key "executionUnitPrices" . atKey "priceSteps" ?~ toJSON (Number 0)
& atKey "utxoCostPerByte" ?~ toJSON (Number 0)
& atKey "treasuryCut" ?~ toJSON (Number 0)
& atKey "minFeeRefScriptCostPerByte" ?~ toJSON (Number 0)
pure cardanoLedgerProtocolParametersFile

-- | Prepare 'RunOptions' to run a hydra-node with given 'ChainConfig' and using the config from
-- config/.
prepareHydraNode ::
HasCallStack =>
ChainConfig ->
FilePath ->
Int ->
SigningKey HydraKey ->
[VerificationKey HydraKey] ->
[Int] ->
(Aeson.Value -> Aeson.Value) ->
IO RunOptions
prepareHydraNode chainConfig workDir hydraNodeId hydraSKey hydraVKeys allNodeIds paramsDecorator = do
-- NOTE: AirPlay on macOS uses port 5000 and we must avoid it.
when (os == "darwin") $ port `shouldNotBe` (5_000 :: Network.PortNumber)
let stateDir = workDir </> "state-" <> show hydraNodeId
createDirectoryIfMissing True stateDir
cardanoLedgerProtocolParametersFile <- preparePParams chainConfig stateDir paramsDecorator
let hydraSigningKey = stateDir </> "me.sk"
void $ writeFileTextEnvelope (File hydraSigningKey) Nothing hydraSKey
hydraVerificationKeys <- forM (zip [1 ..] hydraVKeys) $ \(i :: Int, vKey) -> do
let filepath = stateDir </> ("other-" <> show i <> ".vk")
filepath <$ writeFileTextEnvelope (File filepath) Nothing vKey
pure $
RunOptions
{ verbosity = Verbose "HydraNode"
, nodeId = NodeId $ show hydraNodeId
, listen = Host "0.0.0.0" (fromIntegral $ 5_000 + hydraNodeId)
, advertise = Nothing
, peers
, apiHost = "0.0.0.0"
, apiPort = fromIntegral $ 4_000 + hydraNodeId
, tlsCertPath = Nothing
, tlsKeyPath = Nothing
, monitoringPort = Just $ fromIntegral $ 6_000 + hydraNodeId
, hydraSigningKey
, hydraVerificationKeys
, persistenceDir = stateDir
, chainConfig
, ledgerConfig =
CardanoLedgerConfig
{ cardanoLedgerProtocolParametersFile
}
}
where
port = fromIntegral $ 5_000 + hydraNodeId
-- NOTE: See comment above about 0.0.0.0 vs 127.0.0.1
peers =
[ Host
{ Network.hostname = "0.0.0.0"
, Network.port = fromIntegral $ 5_000 + i
}
| i <- allNodeIds
, i /= hydraNodeId
]

-- | Run a hydra-node with given 'RunOptions'.
withPreparedHydraNode ::
HasCallStack =>
Tracer IO HydraNodeLog ->
FilePath ->
Int ->
RunOptions ->
(HydraClient -> IO a) ->
IO a
withPreparedHydraNode tracer workDir hydraNodeId runOptions action =
withLogFile logFilePath $ \logFileHandle -> do
-- XXX: using a dedicated pipe as 'createPipe' from typed-process closes too early
(readErr, writeErr) <- P.createPipe
let cmd =
(proc "hydra-node" . toArgs $ runOptions)
& setStdout (useHandleOpen logFileHandle)
& setStderr (useHandleOpen writeErr)

traceWith tracer $ HydraNodeCommandSpec $ show cmd

withProcessTerm cmd $ \p -> do
hClose writeErr
-- NOTE: exit code thread gets cancelled if 'action' terminates first
race
(collectAndCheckExitCode p readErr)
(withConnectionToNode tracer hydraNodeId action)
<&> either absurd id
where
collectAndCheckExitCode p h =
(`finally` hClose h) $
waitExitCode p >>= \case
ExitSuccess -> failure "hydra-node stopped early"
ExitFailure ec -> do
err <- hGetContents h
failure . toString $
unlines
[ "hydra-node (nodeId = " <> show hydraNodeId <> ") exited with failure code: " <> show ec
, decodeUtf8 err
]

logFilePath = workDir </> "logs" </> "hydra-node-" <> show hydraNodeId <.> "log"

-- | Run a hydra-node with given 'ChainConfig' and using the config from
-- config/.
withHydraNode ::
7 changes: 7 additions & 0 deletions hydra-cluster/test/Test/EndToEndSpec.hs
@@ -55,6 +55,7 @@ import Hydra.Cluster.Scenarios (
canDecommit,
canRecoverDeposit,
canSeePendingDeposits,
canSideLoadSnapshot,
canSubmitTransactionThroughAPI,
headIsInitializingWith,
initWithWrongKeys,
@@ -465,6 +466,12 @@ spec = around (showLogsOnFailure "EndToEndSpec") $ do
guard $ v ^? key "tag" == Just "HeadIsContested"
guard $ v ^? key "headId" == Just (toJSON headId)

it "can side load snapshot" $ \tracer -> do
withClusterTempDir $ \tmpDir -> do
withCardanoNodeDevnet (contramap FromCardanoNode tracer) tmpDir $ \node ->
publishHydraScriptsAs node Faucet
>>= canSideLoadSnapshot tracer tmpDir node

describe "two hydra heads scenario" $ do
it "two heads on the same network do not conflict" $ \tracer ->
failAfter 60 $