Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(lit-core): LIT-4016 - Enhance error handling during epoch changes #710

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 4 additions & 16 deletions local-tests/setup/tinny-environment.ts
Original file line number Diff line number Diff line change
Expand Up @@ -261,23 +261,11 @@ export class TinnyEnvironment {
throw new Error(`Network not supported: "${this.network}"`);
}

if (globalThis.wasmExports) {
console.warn(
'WASM modules already loaded. Will overide when connect is called'
);
}

if (globalThis.wasmECDSA) {
console.warn(
'WASM modules already loaded. wil overide. when connect is called'
);
}

if (globalThis.wasmSevSnpUtils) {
console.warn(
'WASM modules already loaded. wil overide. when connect is called'
this.litNodeClient.on('connected', () => {
console.log(
'Received `connected` event from `litNodeClient. Ready to go!'
);
}
});

await this.litNodeClient.connect();

Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
"dotenv-parse-variables": "^2.0.0",
"download": "^8.0.0",
"ethers": "^5.7.1",
"eventemitter3": "^5.0.1",
"jose": "^4.14.4",
"jszip": "^3.10.1",
"micromodal": "^0.4.10",
Expand Down
93 changes: 52 additions & 41 deletions packages/core/src/lib/lit-core.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { ethers } from 'ethers';
import { EventEmitter } from 'eventemitter3';

import {
canonicalAccessControlConditionFormatter,
Expand Down Expand Up @@ -63,7 +64,6 @@ import {
NodeClientErrorV0,
NodeClientErrorV1,
NodeCommandServerKeysResponse,
NodeErrorV3,
RejectedNodePromises,
SendNodeCommand,
SessionSigsMap,
Expand Down Expand Up @@ -132,7 +132,7 @@ const FALLBACK_RPC_URLS = [
'https://eth.llamarpc.com',
];

export class LitCore {
export class LitCore extends EventEmitter {
config: LitNodeClientConfigWithDefaults = {
alertWhenUnauthorized: false,
debug: true,
Expand Down Expand Up @@ -165,6 +165,8 @@ export class LitCore {

// ========== Constructor ==========
constructor(config: LitNodeClientConfig | CustomNetwork) {
super();

if (!(config.litNetwork in LIT_NETWORKS)) {
const supportedNetwork = Object.values(LIT_NETWORK).join(', ');

Expand Down Expand Up @@ -308,20 +310,16 @@ export class LitCore {

// ========== Scoped Class Helpers ==========
private async _handleStakingContractStateChange(state: StakingStates) {
log(`New state detected: "${state}"`);

const validatorData = await this._getValidatorData();
try {
log(`New state detected: "${state}"`);

if (state === StakingStates.Active) {
// We always want to track the most recent epoch number on _all_ networks

this._epochState = await this._fetchCurrentEpochState(
validatorData.epochInfo
);
if (state === StakingStates.Active) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't we throw/disconnect when the new state is Paused?

const validatorData = await this._getValidatorData();

if (CENTRALISATION_BY_NETWORK[this.config.litNetwork] !== 'centralised') {
// We don't need to handle node urls changing on centralised networks, since their validator sets are static
try {
if (
CENTRALISATION_BY_NETWORK[this.config.litNetwork] !== 'centralised'
) {
log(
'State found to be new validator set locked, checking validator set'
);
Expand All @@ -330,39 +328,55 @@ export class LitCore {
const delta: string[] = validatorData.bootstrapUrls.filter((item) =>
existingNodeUrls.includes(item)
);
// if the sets differ we reconnect.

// check if the node sets are non-matching and re-connect if they do not.
if (delta.length > 1) {
// check if the node sets are non-matching and re-connect if they do not.
/*
TODO: This covers *most* cases where a node may come in or out of the active
set which we will need to re attest to the execution environments.
However, the sdk currently does not know if there is an active network operation pending.
Such that the state when the request was sent will now mutate when the response is sent back.
The sdk should be able to understand its current execution environment and wait on an active
network request to the previous epoch's node set before changing over.
*/
TODO: This covers *most* cases where a node may come in or out of the active
set which we will need to re attest to the execution environments.
However, the sdk currently does not know if there is an active network operation pending.
Such that the state when the request was sent will now mutate when the response is sent back.
The sdk should be able to understand its current execution environment and wait on an active
network request to the previous epoch's node set before changing over.
*/
log(
'Active validator sets changed, new validators ',
delta,
'starting node connection'
);
await this.connect(); // Will update `epochInfo`
}

await this.connect();
} catch (err: unknown) {
// FIXME: We should emit an error event so that consumers know that we are de-synced and can connect() again
// But for now, our every-30-second network sync will fix things in at most 30s from now.
// this.ready = false; Should we assume core is invalid if we encountered errors refreshing from an epoch change?
const { message = '' } = err as
| Error
| NodeClientErrorV0
| NodeClientErrorV1;
logError(
'Error while attempting to reconnect to nodes after epoch transition:',
message
} else {
// In case of centralised networks, we don't run `connect()` flow, so we will manually update epochInfo here
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is a centralized network treated differently? The only difference should be in the attestation but the handshake should be invariant to the network?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My understanding is that for centralized networks, the node list doesn't change, so re-handshaking with all the nodes on epoch change would be redundant; if that's not true, we should definitely fix it...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we can treat them the same way as decentralized I would say we do so, specially considering the "centralization" is more like a special thing instead of something really relevant in the network design

Also this way we can test nodes change in local (centralized)

this._epochState = await this._fetchCurrentEpochState(
validatorData.epochInfo
);
}
}
} catch (err: unknown) {
// Ensure that any methods that check `this.ready` throw errors to the caller, and any consumers can check appropriately
this.ready = false;

const { message = '' } = err as
| Error
| NodeClientErrorV0
| NodeClientErrorV1;
logError(
'Error while attempting to reconnect to nodes after epoch transition:',
message
);

// Signal to any listeners that we've encountered a fatal error
this.emit(
'error',
new Error(
'Error while attempting to reconnect to nodes after epoch transition:' +
message
)
);

// Signal to any listeners that we're 'disconnected' from LIT network
this.emit('disconnected', true);
}
}

Expand Down Expand Up @@ -408,6 +422,7 @@ export class LitCore {
this._stopListeningForNewEpoch();
// this._stopNetworkPolling();
if (globalThis.litConfig) delete globalThis.litConfig;
this.emit('disconnected', true);
}

// _stopNetworkPolling() {
Expand Down Expand Up @@ -492,11 +507,6 @@ export class LitCore {
}

private async _connect() {
// Ensure an ill-timed epoch change event doesn't trigger concurrent config changes while we're already doing that
Copy link
Contributor Author

@MaximusHaximus MaximusHaximus Nov 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed this call to stop listening -- if the code inside our listener calls connect(), it will just chain on any already-pending connect() logic (see connect() logic and promise chain on `this._connectingPromise.

This doesn't make the client self-healing, but it does mean that a failure won't leave the client in a state where it is no longer listening for further epoch change events.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But will it chain the promises since we specifically check that if a pending connection is open then just return it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We always re-set _connectionPromise to null when a connect() call finishes -- so if we are connecting, multiple calls to connect() will all return the same promise. This means that if we're still processing connect() when we receive another epoch change event, it will be effectively a no-op.

It occurred to me that we could implement cancellation across this entire callstack, and then cancel the entire existing call to connect() (along with any pending fetch() calls that it is running), and then run it again from the top. I'd like to implement a much more robust network handling layer based on the v7 branch and land this as-is for now, since it's an improvement over what we've got.

this._stopListeningForNewEpoch();
// Ensure we don't fire an existing network sync poll handler while we're in the midst of connecting anyway
// this._stopNetworkPolling();

// Initialize a contractContext if we were not given one; this allows interaction against the staking contract
// to be handled locally from then on
if (!this.config.contractContext) {
Expand Down Expand Up @@ -531,7 +541,6 @@ export class LitCore {
}

// Re-use staking contract instance from previous connect() executions that succeeded to improve performance
// noinspection ES6MissingAwait - intentionally not `awaiting` so we can run this in parallel below
const validatorData = await this._getValidatorData();

this._stakingContract = validatorData.stakingContract;
Expand Down Expand Up @@ -565,6 +574,8 @@ export class LitCore {
latestBlockhash: this.latestBlockhash,
});

this.emit('connected', true);

// browser only
if (isBrowser()) {
document.dispatchEvent(new Event('lit-ready'));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2203,6 +2203,18 @@ export class LitNodeClientNodeJs

const signatures: SessionSigsMap = {};

if (!this.ready) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay getSessionSig() was the only function missing this.ready check. Actually we check this in the signSessionKey()?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do check it in signSessionKey(), but that method isn't called from getSessionSigs() -- it is only called from the authNeededCallback() defined in getPkpSessionSigs() and in some auth providers -- so I added it here for completeness 👍

// If the client isn't ready, `connectedNodes` may be out-of-date, and we should throw an error immediately
const message =
'[executeJs] LitNodeClient is not ready. Please call await litNodeClient.connect() first.';

throwError({
message,
errorKind: LIT_ERROR.LIT_NODE_CLIENT_NOT_READY_ERROR.kind,
errorCode: LIT_ERROR.LIT_NODE_CLIENT_NOT_READY_ERROR.name,
});
}

this.connectedNodes.forEach((nodeAddress: string) => {
const toSign: SessionSigningTemplate = {
...signingTemplate,
Expand Down
Loading