Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Error reporting for network & consensus errors #4015

Merged
merged 6 commits into from
Sep 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/interface-CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ team. See [consensus
CHANGELOG](../ouroboros-consensus/docs/interface-CHANGELOG.md) file for how
this changelog is supposed to be used.

## Circa 2022-09-20

- 'InitializationTracer' type renamed as 'DiffusionTracer'.
- The 'dtDiffusionInitializationTracer' record field of
'Ouoroboros.Network.Diffusion.Tracers' record renamed as 'dtDiffusionTracer'.

## Circa 2022-07-25

- renamed `TrError` as `TrConnectionHandlerError` which is a constructor of
Expand Down
6 changes: 6 additions & 0 deletions ouroboros-consensus/docs/interface-CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,12 @@ may appear out of chronological order.
The internals of each entry are organized similar to
https://keepachangelog.com/en/1.1.0/, adapted to our plan explained above.

## Circa 2022-09-20

### Added

- `consensusStartupErrorTracer` to `Ouroboros.Consensus.Node.Tracers.Tracers'`: a tracer which logs consensus startup errors.

## Circa 2022-08-08

### Added
Expand Down
173 changes: 93 additions & 80 deletions ouroboros-consensus/src/Ouroboros/Consensus/Node.hs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ module Ouroboros.Consensus.Node (
) where

import Codec.Serialise (DeserialiseFailure)
import Control.Tracer (Tracer, contramap)
import Control.Tracer (Tracer, contramap, traceWith)
import Data.ByteString.Lazy (ByteString)
import Data.Hashable (Hashable)
import Data.Map.Strict (Map)
Expand All @@ -73,9 +73,9 @@ import Ouroboros.Network.NodeToClient (ConnectionId, LocalAddress,
LocalSocket, NodeToClientVersionData (..), combineVersions,
simpleSingletonVersions)
import Ouroboros.Network.NodeToNode (DiffusionMode (..),
MiniProtocolParameters, NodeToNodeVersionData (..),
RemoteAddress, Socket, blockFetchPipeliningMax,
defaultMiniProtocolParameters)
ExceptionInHandler (..), MiniProtocolParameters,
NodeToNodeVersionData (..), RemoteAddress, Socket,
blockFetchPipeliningMax, defaultMiniProtocolParameters)
import Ouroboros.Network.PeerSelection.LedgerPeers
(LedgerPeersConsensusInterface (..))
import Ouroboros.Network.PeerSelection.PeerMetric (PeerMetrics,
Expand Down Expand Up @@ -277,89 +277,102 @@ run args stdArgs = stdLowLevelRunNodeArgsIO args stdArgs >>= runWith args
runWith :: forall m addrNTN addrNTC versionDataNTN versionDataNTC blk p2p.
( RunNode blk
, IOLike m, MonadTime m, MonadTimer m
, Hashable addrNTN, Ord addrNTN, Typeable addrNTN
, Hashable addrNTN, Ord addrNTN, Show addrNTN, Typeable addrNTN
)
=> RunNodeArgs m addrNTN addrNTC blk p2p
-> LowLevelRunNodeArgs m addrNTN addrNTC versionDataNTN versionDataNTC blk p2p
-> m ()
runWith RunNodeArgs{..} LowLevelRunNodeArgs{..} =

llrnWithCheckedDB $ \(LastShutDownWasClean lastShutDownWasClean) continueWithCleanChainDB ->
withRegistry $ \registry -> do
let systemStart :: SystemStart
systemStart = getSystemStart (configBlock cfg)

systemTime :: SystemTime m
systemTime = defaultSystemTime
systemStart
(blockchainTimeTracer rnTraceConsensus)

inFuture :: CheckInFuture m blk
inFuture = InFuture.reference
(configLedger cfg)
llrnMaxClockSkew
systemTime

let customiseChainDbArgs' args
| lastShutDownWasClean
= llrnCustomiseChainDbArgs args
| otherwise
-- When the last shutdown was not clean, validate the complete
-- ChainDB to detect and recover from any corruptions. This will
-- override the default value /and/ the user-customised value of
-- the 'ChainDB.cdbImmValidation' and the
-- 'ChainDB.cdbVolValidation' fields.
= (llrnCustomiseChainDbArgs args) {
ChainDB.cdbImmutableDbValidation = ValidateAllChunks
, ChainDB.cdbVolatileDbValidation = ValidateAll
}

chainDB <- openChainDB registry inFuture cfg initLedger
llrnChainDbArgsDefaults customiseChainDbArgs'

continueWithCleanChainDB chainDB $ do
btime <-
hardForkBlockchainTime $
llrnCustomiseHardForkBlockchainTimeArgs $
HardForkBlockchainTimeArgs
{ hfbtBackoffDelay = pure $ BackoffDelay 60
, hfbtGetLedgerState =
ledgerState <$> ChainDB.getCurrentLedger chainDB
, hfbtLedgerConfig = configLedger cfg
, hfbtRegistry = registry
, hfbtSystemTime = systemTime
, hfbtTracer =
contramap (fmap (fromRelativeTime systemStart))
(blockchainTimeTracer rnTraceConsensus)
, hfbtMaxClockRewind = secondsToNominalDiffTime 20
}

nodeKernelArgs <-
fmap (nodeKernelArgsEnforceInvariants . llrnCustomiseNodeKernelArgs) $
mkNodeKernelArgs
registry
llrnBfcSalt
llrnKeepAliveRng
cfg
blockForging
rnTraceConsensus
btime
chainDB
nodeKernel <- initNodeKernel nodeKernelArgs
rnNodeKernelHook registry nodeKernel

peerMetrics <- newPeerMetric Diffusion.peerMetricsConfiguration
let ntnApps = mkNodeToNodeApps nodeKernelArgs nodeKernel peerMetrics
ntcApps = mkNodeToClientApps nodeKernelArgs nodeKernel
(apps, appsExtra) = mkDiffusionApplications
rnEnableP2P
(miniProtocolParameters nodeKernelArgs)
ntnApps
ntcApps
nodeKernel
peerMetrics

llrnRunDataDiffusion registry apps appsExtra
withRegistry $ \registry ->
handleJust
-- ignore exception thrown in connection handlers and diffusion
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should provide a rationale on why we ignore these errors.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are already logged by the network layer. I'll update the comment.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Included, I also caught a freshman mistake, check the diff 😁

-- initialisation failures; these errors are logged by the network
-- layer.
(\err -> case fromException err :: Maybe ExceptionInHandler of
Just _ -> Nothing
Nothing ->
case fromException err :: Maybe (Diffusion.Failure addrNTN) of
Just _ -> Nothing
Nothing -> Just err)
(\err -> traceWith (consensusStartupErrorTracer rnTraceConsensus) err
>> throwIO err
) $ do
let systemStart :: SystemStart
systemStart = getSystemStart (configBlock cfg)

systemTime :: SystemTime m
systemTime = defaultSystemTime
systemStart
(blockchainTimeTracer rnTraceConsensus)

inFuture :: CheckInFuture m blk
inFuture = InFuture.reference
(configLedger cfg)
llrnMaxClockSkew
systemTime

let customiseChainDbArgs' args
| lastShutDownWasClean
= llrnCustomiseChainDbArgs args
| otherwise
-- When the last shutdown was not clean, validate the complete
-- ChainDB to detect and recover from any corruptions. This will
-- override the default value /and/ the user-customised value of
-- the 'ChainDB.cdbImmValidation' and the
-- 'ChainDB.cdbVolValidation' fields.
= (llrnCustomiseChainDbArgs args) {
ChainDB.cdbImmutableDbValidation = ValidateAllChunks
, ChainDB.cdbVolatileDbValidation = ValidateAll
}

chainDB <- openChainDB registry inFuture cfg initLedger
llrnChainDbArgsDefaults customiseChainDbArgs'

continueWithCleanChainDB chainDB $ do
btime <-
hardForkBlockchainTime $
llrnCustomiseHardForkBlockchainTimeArgs $
HardForkBlockchainTimeArgs
{ hfbtBackoffDelay = pure $ BackoffDelay 60
, hfbtGetLedgerState =
ledgerState <$> ChainDB.getCurrentLedger chainDB
, hfbtLedgerConfig = configLedger cfg
, hfbtRegistry = registry
, hfbtSystemTime = systemTime
, hfbtTracer =
contramap (fmap (fromRelativeTime systemStart))
(blockchainTimeTracer rnTraceConsensus)
, hfbtMaxClockRewind = secondsToNominalDiffTime 20
}

nodeKernelArgs <-
fmap (nodeKernelArgsEnforceInvariants . llrnCustomiseNodeKernelArgs) $
mkNodeKernelArgs
registry
llrnBfcSalt
llrnKeepAliveRng
cfg
blockForging
rnTraceConsensus
btime
chainDB
nodeKernel <- initNodeKernel nodeKernelArgs
rnNodeKernelHook registry nodeKernel

peerMetrics <- newPeerMetric Diffusion.peerMetricsConfiguration
let ntnApps = mkNodeToNodeApps nodeKernelArgs nodeKernel peerMetrics
ntcApps = mkNodeToClientApps nodeKernelArgs nodeKernel
(apps, appsExtra) = mkDiffusionApplications
rnEnableP2P
(miniProtocolParameters nodeKernelArgs)
ntnApps
ntcApps
nodeKernel
peerMetrics

llrnRunDataDiffusion registry apps appsExtra
where
ProtocolInfo
{ pInfoConfig = cfg
Expand Down
5 changes: 5 additions & 0 deletions ouroboros-consensus/src/Ouroboros/Consensus/Node/Tracers.hs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ module Ouroboros.Consensus.Node.Tracers (
, TraceLabelCreds (..)
) where

import Control.Exception (SomeException)
import Control.Tracer (Tracer, nullTracer, showTracing)
import Data.Text (Text)
import Data.Time (UTCTime)
Expand Down Expand Up @@ -63,6 +64,7 @@ data Tracers' remotePeer localPeer blk f = Tracers
, blockchainTimeTracer :: f (TraceBlockchainTimeEvent UTCTime)
, forgeStateInfoTracer :: f (TraceLabelCreds (ForgeStateInfo blk))
, keepAliveClientTracer :: f (TraceKeepAliveClient remotePeer)
, consensusStartupErrorTracer :: f SomeException
}

instance (forall a. Semigroup (f a))
Expand All @@ -82,6 +84,7 @@ instance (forall a. Semigroup (f a))
, blockchainTimeTracer = f blockchainTimeTracer
, forgeStateInfoTracer = f forgeStateInfoTracer
, keepAliveClientTracer = f keepAliveClientTracer
, consensusStartupErrorTracer = f consensusStartupErrorTracer
}
where
f :: forall a. Semigroup a
Expand Down Expand Up @@ -109,6 +112,7 @@ nullTracers = Tracers
, blockchainTimeTracer = nullTracer
, forgeStateInfoTracer = nullTracer
, keepAliveClientTracer = nullTracer
, consensusStartupErrorTracer = nullTracer
}

showTracers :: ( Show blk
Expand Down Expand Up @@ -139,6 +143,7 @@ showTracers tr = Tracers
, blockchainTimeTracer = showTracing tr
, forgeStateInfoTracer = showTracing tr
, keepAliveClientTracer = showTracing tr
, consensusStartupErrorTracer = showTracing tr
}

{-------------------------------------------------------------------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ makeConnectionHandler muxTracer singMuxMode
throwTo mainThreadId (ExceptionInHandler remoteAddress err)
throwIO err
ShutdownPeer ->
throwIO err
throwIO (ExceptionInHandler remoteAddress err)

outboundConnectionHandler
:: HasInitiator muxMode ~ True
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -370,22 +370,23 @@ data Inactive =
deriving (Eq, Show)


-- | Exception which where caught in the connection thread and were re-thrown
-- in the main thread by the 'rethrowPolicy'.
-- | Exception which where caught in the connection thread and were re-thrown in
-- the main thread by the 'rethrowPolicy'.
--
data ExceptionInHandler peerAddr where
ExceptionInHandler :: !peerAddr
data ExceptionInHandler where
ExceptionInHandler :: forall peerAddr.
(Typeable peerAddr, Show peerAddr)
=> !peerAddr
-> !SomeException
-> ExceptionInHandler peerAddr
-> ExceptionInHandler
deriving Typeable

instance Show peerAddr => Show (ExceptionInHandler peerAddr) where
instance Show ExceptionInHandler where
show (ExceptionInHandler peerAddr e) = "ExceptionInHandler "
++ show peerAddr
++ " "
++ show e
instance ( Show peerAddr
, Typeable peerAddr ) => Exception (ExceptionInHandler peerAddr)
instance Exception ExceptionInHandler


-- | Data type used to classify 'handleErrors'.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,12 @@ instance Monoid ErrorCommand where
mempty = ShutdownPeer


-- | Whether an exception happend on outbound or inbound connection.
-- | Whether an exception happened on outbound or inbound connection.
--
-- TODO: It would be more useful to have access whether the exception happend
-- on initiator or responder. The easiest way to fix this is make mux throw the
-- exception together with context. This allows to keep error handling be done
-- only by the connection manager (rather than by server and
-- TODO: It would be more useful to have access to whether the exception
-- happened on initiator or responder. The easiest way to fix this is make mux
-- throw the exception together with context. This allows to keep error
-- handling be done only by the connection manager (rather than by server and
-- 'PeerStateActions').
--
data ErrorContext = OutboundError
Expand Down
2 changes: 1 addition & 1 deletion ouroboros-network/src/Ouroboros/Network/Diffusion.hs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
module Ouroboros.Network.Diffusion
( -- * Common API
P2P (..)
, InitializationTracer (..)
, DiffusionTracer (..)
, Tracers (..)
, nullTracers
, ExtraTracers (..)
Expand Down
25 changes: 14 additions & 11 deletions ouroboros-network/src/Ouroboros/Network/Diffusion/Common.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
{-# LANGUAGE DataKinds #-}

module Ouroboros.Network.Diffusion.Common
( InitializationTracer (..)
( DiffusionTracer (..)
, Failure (..)
, Tracers (..)
, nullTracers
Expand Down Expand Up @@ -31,9 +31,12 @@ import Ouroboros.Network.PeerSelection.LedgerPeers
import Ouroboros.Network.Snocket (FileDescriptor)
import Ouroboros.Network.Socket (SystemdSocketTracer)

-- TODO: use LocalAddress where appropriate rather than 'path'.
-- | The 'DiffusionTracer' logs
--
data InitializationTracer ntnAddr ntcAddr
-- * diffusion initialisation messages
-- * terminal errors thrown by diffusion
--
data DiffusionTracer ntnAddr ntcAddr
= RunServer (NonEmpty ntnAddr)
| RunLocalServer ntcAddr
| UsingSystemdSocket ntcAddr
Expand Down Expand Up @@ -89,8 +92,8 @@ data Tracers ntnAddr ntnVersion ntcAddr ntcVersion m = Tracers {
:: Tracer m (NodeToClient.HandshakeTr ntcAddr ntcVersion)

-- | Diffusion initialisation tracer
, dtDiffusionInitializationTracer
:: Tracer m (InitializationTracer ntnAddr ntcAddr)
, dtDiffusionTracer
:: Tracer m (DiffusionTracer ntnAddr ntcAddr)

-- | Ledger Peers tracer
, dtLedgerPeersTracer
Expand All @@ -103,12 +106,12 @@ nullTracers :: Applicative m
ntcAddr ntcVersion
m
nullTracers = Tracers {
dtMuxTracer = nullTracer
, dtHandshakeTracer = nullTracer
, dtLocalMuxTracer = nullTracer
, dtLocalHandshakeTracer = nullTracer
, dtDiffusionInitializationTracer = nullTracer
, dtLedgerPeersTracer = nullTracer
dtMuxTracer = nullTracer
, dtHandshakeTracer = nullTracer
, dtLocalMuxTracer = nullTracer
, dtLocalHandshakeTracer = nullTracer
, dtDiffusionTracer = nullTracer
, dtLedgerPeersTracer = nullTracer
}

-- | Common DiffusionArguments interface between P2P and NonP2P
Expand Down
Loading