Skip to content

Commit

Permalink
fix: failed deployments report (#890)
Browse files Browse the repository at this point in the history
* refactor: convert retry failed deployments into component

* fix: ignore fix for failed deployments

* fix: mocked synchronization manager import
  • Loading branch information
guidota authored Feb 1, 2022
1 parent 6105dcf commit 97054c9
Show file tree
Hide file tree
Showing 9 changed files with 162 additions and 84 deletions.
9 changes: 8 additions & 1 deletion content/src/Environment.ts
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,8 @@ export enum EnvironmentConfig {
CACHE_SIZES,
BLOCKS_L1_SUBGRAPH_URL,
BLOCKS_L2_SUBGRAPH_URL,
VALIDATE_API
VALIDATE_API,
RETRY_FAILED_DEPLOYMENTS_DELAY_TIME
}

export class EnvironmentBuilder {
Expand Down Expand Up @@ -361,6 +362,12 @@ export class EnvironmentBuilder {

this.registerConfigIfNotAlreadySet(env, EnvironmentConfig.VALIDATE_API, () => process.env.VALIDATE_API == 'true')

this.registerConfigIfNotAlreadySet(
env,
EnvironmentConfig.RETRY_FAILED_DEPLOYMENTS_DELAY_TIME,
() => process.env.RETRY_FAILED_DEPLOYMENTS_DELAY_TIME ?? ms('15m')
)

return await initComponentsWithEnv(env)
}

Expand Down
23 changes: 16 additions & 7 deletions content/src/components.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ import { createBatchDeployerComponent } from './service/synchronization/batchDep
import { ChallengeSupervisor } from './service/synchronization/ChallengeSupervisor'
import { DAOClientFactory } from './service/synchronization/clients/DAOClientFactory'
import { ContentCluster } from './service/synchronization/ContentCluster'
import { ClusterSynchronizationManager } from './service/synchronization/SynchronizationManager'
import { createRetryFailedDeployments } from './service/synchronization/retryFailedDeployments'
import { createSynchronizationManager } from './service/synchronization/SynchronizationManager'
import { SystemPropertiesManager } from './service/system-properties/SystemProperties'
import { createServerValidator } from './service/validations/server'
import { createValidator } from './service/validations/validator'
Expand Down Expand Up @@ -167,18 +168,25 @@ export async function initComponentsWithEnv(env: Environment): Promise<AppCompon
}
)

const synchronizationManager = new ClusterSynchronizationManager({
synchronizationJobManager,
downloadQueue,
deployer,
fetcher,
const retryFailedDeployments = createRetryFailedDeployments({
env,
metrics,
staticConfigs,
fetcher,
downloadQueue,
logs,
deployer,
contentCluster,
failedDeploymentsCache
})

const synchronizationManager = createSynchronizationManager({
synchronizationJobManager,
logs,
contentCluster,
retryFailedDeployments
})

const ethNetwork: string = env.getConfig(EnvironmentConfig.ETH_NETWORK)

const controller = new Controller(
Expand Down Expand Up @@ -230,6 +238,7 @@ export async function initComponentsWithEnv(env: Environment): Promise<AppCompon
systemPropertiesManager,
catalystFetcher,
daoClient,
server
server,
retryFailedDeployments
}
}
14 changes: 7 additions & 7 deletions content/src/logic/deployments.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@ export async function retryFailedDeploymentExecution(
| 'contentCluster'
| 'failedDeploymentsCache'
>,
logger: ILoggerComponent.ILogger
logger?: ILoggerComponent.ILogger
): Promise<void> {
const logs = logger || components.logs.getLogger('retryFailedDeploymentExecution')
// Get Failed Deployments from local storage
const failedDeployments: FailedDeployment[] = components.deployer.getAllFailedDeployments()

Expand All @@ -41,7 +42,7 @@ export async function retryFailedDeploymentExecution(
// Build Deployment from other servers
const { entityId, entityType, authChain } = failedDeployment
if (authChain) {
logger.debug(`Will retry to deploy entity`, { entityId, entityType })
logs.debug(`Will retry to deploy entity`, { entityId, entityType })
try {
await deployEntityFromRemoteServer(
components,
Expand All @@ -55,16 +56,15 @@ export async function retryFailedDeploymentExecution(
// it failed again, override failed deployment error description
const errorDescription = error.message + ''

// TODO [mendez] this condition has no test coverage
if (!errorDescription.startsWith(IGNORING_FIX_ERROR)) {
if (!errorDescription.includes(IGNORING_FIX_ERROR)) {
components.failedDeploymentsCache.reportFailure({ ...failedDeployment, errorDescription })
}

logger.error(`Failed to fix deployment of entity`, { entityId, entityType, errorDescription })
logger.error(error)
logs.error(`Failed to fix deployment of entity`, { entityId, entityType, errorDescription })
logs.error(error)
}
} else {
logger.info(`Can't retry failed deployment. Because it lacks of authChain`, { entityId, entityType })
logs.info(`Can't retry failed deployment. Because it lacks of authChain`, { entityId, entityType })
}
}
}
101 changes: 38 additions & 63 deletions content/src/service/synchronization/SynchronizationManager.ts
Original file line number Diff line number Diff line change
@@ -1,20 +1,8 @@
import { delay } from '@catalyst/commons'
import { ILoggerComponent } from '@well-known-components/interfaces'
import ms from 'ms'
import { retryFailedDeploymentExecution } from '../../logic/deployments'
import { AppComponents, IStatusCapableComponent, StatusProbeResult } from '../../types'
import { AppComponents, IStatusCapableComponent } from '../../types'

type ContentSyncComponents = Pick<
AppComponents,
| 'staticConfigs'
| 'logs'
| 'downloadQueue'
| 'metrics'
| 'fetcher'
| 'synchronizationJobManager'
| 'deployer'
| 'contentCluster'
| 'failedDeploymentsCache'
'logs' | 'synchronizationJobManager' | 'contentCluster' | 'retryFailedDeployments'
>

export enum SynchronizationState {
Expand All @@ -23,62 +11,49 @@ export enum SynchronizationState {
SYNCING = 'Syncing'
}

export class ClusterSynchronizationManager implements IStatusCapableComponent {
private static LOGGER: ILoggerComponent.ILogger
export type ISynchronizationManager = IStatusCapableComponent & {
syncWithServers(): Promise<void>
}

private synchronizationState: SynchronizationState = SynchronizationState.BOOTSTRAPPING
export const createSynchronizationManager = (components: ContentSyncComponents): ISynchronizationManager => {
const logger = components.logs.getLogger('ClusterSynchronizationManager')

constructor(public components: ContentSyncComponents) {
ClusterSynchronizationManager.LOGGER = components.logs.getLogger('ClusterSynchronizationManager')
}
let synchronizationState = SynchronizationState.BOOTSTRAPPING

async getComponentStatus(): Promise<StatusProbeResult> {
const clusterStatus = this.components.contentCluster.getStatus()
return {
name: 'synchronizationStatus',
data: {
...clusterStatus,
synchronizationState: this.synchronizationState
return {
getComponentStatus: async () => {
const clusterStatus = components.contentCluster.getStatus()
return {
name: 'synchronizationStatus',
data: {
...clusterStatus,
synchronizationState: synchronizationState
}
}
},
syncWithServers: async () => {
logger.info(`Starting to sync entities from servers pointer changes`)
const setDesiredJobs = () => {
synchronizationState = SynchronizationState.SYNCING
const desiredJobNames = new Set(components.contentCluster.getAllServersInCluster())
// the job names are the contentServerUrl
return components.synchronizationJobManager.setDesiredJobs(desiredJobNames)
}
}
}

// This is the method that is called to sync with other catalysts
async syncWithServers(): Promise<void> {
ClusterSynchronizationManager.LOGGER.info(`Starting to sync entities from servers pointer changes`)
const setDesiredJobs = () => {
this.synchronizationState = SynchronizationState.SYNCING
const desiredJobNames = new Set(this.components.contentCluster.getAllServersInCluster())
// the job names are the contentServerUrl
return this.components.synchronizationJobManager.setDesiredJobs(desiredJobNames)
}

// start the sync jobs
setDesiredJobs()

// setDesiredJobs every time we synchronize the DAO servers, this is an asynchronous job.
// the setDesiredJobs function handles the lifecycle od those async jobs.
this.components.contentCluster.onSyncFinished(() => {
this.synchronizationState = SynchronizationState.SYNCED
// start the sync jobs
setDesiredJobs()
})

// Configure retry for failed deployments
this.retryFailedDeployments().catch(() => {
ClusterSynchronizationManager.LOGGER.error('There was an error during the retry of failed deployments.')
})
}

// TODO: [wkc] make this a CronJob stoppable component
private async retryFailedDeployments(): Promise<void> {
while (true) {
// TODO: [new-sync] Make this configurable
await delay(ms('15m'))
try {
await retryFailedDeploymentExecution(this.components, ClusterSynchronizationManager.LOGGER)
} catch (err: any) {
ClusterSynchronizationManager.LOGGER.error(err)
}
// setDesiredJobs every time we synchronize the DAO servers, this is an asynchronous job.
// the setDesiredJobs function handles the lifecycle of those async jobs.
components.contentCluster.onSyncFinished(() => {
synchronizationState = SynchronizationState.SYNCED
setDesiredJobs()
})

// Configure retry for failed deployments
components.retryFailedDeployments.schedule().catch(() => {
logger.error('There was an error during the retry of failed deployments.')
})
}
}
}
57 changes: 57 additions & 0 deletions content/src/service/synchronization/retryFailedDeployments.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import { IBaseComponent } from '@well-known-components/interfaces'
import { EnvironmentConfig } from '../../Environment'
import { retryFailedDeploymentExecution } from '../../logic/deployments'
import { AppComponents } from '../../types'

/**
 * Contract of the component that periodically retries failed deployments.
 *
 * It is an `IBaseComponent` extended with a `schedule` function. `schedule` returns a promise
 * for the retry loop, so callers that fire-and-forget it should attach a `.catch()`.
 */
export type IRetryFailedDeploymentsComponent = IBaseComponent & {
schedule: () => Promise<void>
}
/**
 * This component schedules the retry of failed deployments.
 *
 * - `start()` marks the component as running.
 * - `stop()` marks it as stopped, cancels the pending timer and wakes up the loop so that the
 *   promise returned by `schedule()` settles instead of staying pending forever.
 * - `schedule()` loops while the component is running: it waits for the configured delay
 *   (`EnvironmentConfig.RETRY_FAILED_DEPLOYMENTS_DELAY_TIME`) and then calls
 *   `retryFailedDeploymentExecution`. Errors thrown by an execution are logged and the loop
 *   continues. If the component is not running when `schedule()` is called, it returns immediately.
 *
 * NOTE(review): only one pending wait is tracked at a time, so this assumes a single active
 * `schedule()` loop per component instance — confirm against callers.
 *
 * @param components - the subset of application components needed to read the configuration,
 * log, and execute the retry of failed deployments.
 * @returns the component exposing `start`, `stop` and `schedule`.
 */
export const createRetryFailedDeployments = (
  components: Pick<
    AppComponents,
    | 'env'
    | 'metrics'
    | 'staticConfigs'
    | 'fetcher'
    | 'downloadQueue'
    | 'logs'
    | 'deployer'
    | 'contentCluster'
    | 'failedDeploymentsCache'
  >
): IRetryFailedDeploymentsComponent => {
  const logger = components.logs.getLogger('RetryFailedDeployments')

  // Same value as the default registered for this config (15 minutes), in milliseconds.
  const fallbackDelay = 15 * 60 * 1000

  // The config may come straight from an environment variable, in which case it is a string at
  // runtime despite the declared type. Normalize it, and refuse values that are not a usable
  // delay: a NaN timeout is treated by Node.js as ~1ms, which would turn this into a busy loop.
  const configuredDelay: unknown = components.env.getConfig<number>(
    EnvironmentConfig.RETRY_FAILED_DEPLOYMENTS_DELAY_TIME
  )
  const parsedDelay = Number(configuredDelay)
  const hasValidDelay = Number.isFinite(parsedDelay) && parsedDelay >= 0
  const retryDelay = hasValidDelay ? parsedDelay : fallbackDelay
  if (!hasValidDelay) {
    logger.warn('Invalid retry delay for failed deployments, using the default', {
      configuredDelay: String(configuredDelay),
      retryDelay
    })
  }

  let timeoutId: NodeJS.Timeout | undefined
  // Resolver of the wait currently in progress, kept so that stop() can end it early.
  let wakeUp: (() => void) | undefined
  let running = false

  // Waits for the retry delay, or until stop() wakes it up.
  const waitForNextRun = (): Promise<void> =>
    new Promise<void>((resolve) => {
      wakeUp = resolve
      timeoutId = setTimeout(resolve, retryDelay)
    })

  return {
    start: async () => {
      running = true
      logger.debug('Starting retry failed deployments')
    },
    stop: async () => {
      running = false
      if (timeoutId) {
        clearTimeout(timeoutId)
        timeoutId = undefined
      }
      // Clearing the timer alone would leave schedule() awaiting a promise that never resolves.
      if (wakeUp) {
        wakeUp()
        wakeUp = undefined
      }
      logger.debug('Stopping retry failed deployments')
    },
    schedule: async () => {
      while (running) {
        await waitForNextRun()
        // The component may have been stopped while waiting.
        if (!running) return
        try {
          logger.debug('Executing retry failed deployments')
          await retryFailedDeploymentExecution(components, logger)
        } catch (err: unknown) {
          logger.error(err instanceof Error ? err : String(err))
        }
      }
    }
  }
}
6 changes: 4 additions & 2 deletions content/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ import { MetaverseContentService } from './service/Service'
import { ISnapshotManager } from './service/snapshots/SnapshotManager'
import { IChallengeSupervisor } from './service/synchronization/ChallengeSupervisor'
import { ContentCluster } from './service/synchronization/ContentCluster'
import { ClusterSynchronizationManager } from './service/synchronization/SynchronizationManager'
import { IRetryFailedDeploymentsComponent } from './service/synchronization/retryFailedDeployments'
import { ISynchronizationManager } from './service/synchronization/SynchronizationManager'
import { SystemPropertiesManager } from './service/system-properties/SystemProperties'
import { ServerValidator } from './service/validations/server'
import { ContentStorage } from './storage/ContentStorage'
Expand All @@ -44,8 +45,8 @@ export type AppComponents = {
}
batchDeployer: IDeployerComponent
synchronizationJobManager: JobLifecycleManagerComponent
synchronizationManager: ISynchronizationManager
deployedEntitiesFilter: DeploymentListComponent
synchronizationManager: ClusterSynchronizationManager
controller: Controller
snapshotManager: ISnapshotManager
denylist: Denylist
Expand All @@ -64,6 +65,7 @@ export type AppComponents = {
catalystFetcher: Fetcher
daoClient: DAOClient
server: Server
retryFailedDeployments: IRetryFailedDeploymentsComponent

// this will be replaced by `database` and removed from here
repository: Repository
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { ClusterSynchronizationManager } from '../../../../src/service/synchronization/SynchronizationManager'
import { ISynchronizationManager } from '../../../../src/service/synchronization/SynchronizationManager'

export function makeNoopSynchronizationManager(component: ClusterSynchronizationManager) {
export function makeNoopSynchronizationManager(component: ISynchronizationManager) {
jest.spyOn(component, 'syncWithServers').mockResolvedValue()
}
4 changes: 2 additions & 2 deletions content/test/integration/E2ETestEnvironment.ts
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ export function loadTestEnvironment(
describe(name, () => {
const testEnv = new E2ETestEnvironment()

it('starts the test environment', async () => {
beforeEach(async () => {
await testEnv.start(overrideConfigs)
})

Expand All @@ -220,7 +220,7 @@ export function loadTestEnvironment(
test(testEnv)
})

it('stops the test environment', async () => {
afterEach(async () => {
await testEnv.stop()
})
})
Expand Down
28 changes: 28 additions & 0 deletions content/test/integration/syncronization/failed-deployments.spec.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { createIdentity } from 'eth-crypto'
import { stub } from 'sinon'
import { EnvironmentConfig } from '../../../src/Environment'
import { retryFailedDeploymentExecution } from '../../../src/logic/deployments'
import { FailedDeployment, FailureReason } from '../../../src/ports/failedDeploymentsCache'
import { assertDeploymentFailed, assertDeploymentFailsWith, assertEntitiesAreActiveOnServer } from '../E2EAssertions'
import { loadTestEnvironment } from '../E2ETestEnvironment'
Expand Down Expand Up @@ -109,6 +110,33 @@ loadTestEnvironment()('Errors during sync', (testEnv) => {
})
})
})

// Scenario: a second entity is deployed for the same pointer while the first entity is still
// registered as a failed deployment on server 2; after retrying, no failed deployments are expected.
// NOTE(review): relies on `this.server2`, `this.controllerEntity` and `this.serverValidatorStub2`
// being set by an enclosing setup that is not visible here — confirm.
describe('ignore to fix the failed deployment when there are newer entities', function () {
beforeEach(async function () {
// Deploy a new entity for the same pointer
this.anotherEntityCombo = await buildDeployDataAfterEntity(this.controllerEntity, ['0,1'], {
metadata: 'metadata2'
})
// Deploy entity 2 on server 2
await this.server2.deploy(this.anotherEntityCombo.deployData)
// Wait until the new entity is active on server 2 and the first one is reported as failed there
await awaitUntil(() => assertEntitiesAreActiveOnServer(this.server2, this.anotherEntityCombo.controllerEntity))
await awaitUntil(() =>
assertDeploymentFailed(this.server2, FailureReason.DEPLOYMENT_ERROR, this.controllerEntity)
)

// Restore server validations to detect the newer entity
this.serverValidatorStub2.restore()

// Run a single retry pass directly; no logger is passed, so the function builds its own
await retryFailedDeploymentExecution(this.server2.components)
})

it('is removed from failed', async function () {
// Poll until server 2 reports no failed deployments
await awaitUntil(async () => {
const newFailedDeployments: FailedDeployment[] = await this.server2.getFailedDeployments()
expect(newFailedDeployments.length).toBe(0)
})
})
})
})

describe('Deploy as fix a not failed entity', function () {
Expand Down

0 comments on commit 97054c9

Please sign in to comment.