Skip to content

Commit

Permalink
fix: when SendData callback times out continuously, mark node as dead…
Browse files Browse the repository at this point in the history
… and do not restart driver (#6403)
  • Loading branch information
AlCalzone authored Oct 16, 2023
1 parent 9818c58 commit 15ed348
Show file tree
Hide file tree
Showing 2 changed files with 137 additions and 39 deletions.
152 changes: 123 additions & 29 deletions packages/zwave-js/src/lib/driver/Driver.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3493,40 +3493,143 @@ export class Driver extends TypedEventEmitter<DriverEventCallbacks>
}
}

private handleUnresponsiveController(
/**
* @internal
* Handles the case that the controller didn't acknowledge a command in time
* Returns `true` if the transaction failure was handled, `false` if it needs to be rejected.
*/
private handleMissingControllerACK(
transaction: Transaction,
error: ZWaveError,
error: ZWaveError & {
code: ZWaveErrorCodes.Controller_Timeout;
context: "ACK";
},
): boolean {
if (!this._controller) return false;

const fail = (message: string) => {
this.rejectTransaction(transaction, error);

void this.destroyWithMessage(message);
};

if (this._controller.status !== ControllerStatus.Unresponsive) {
// The controller was responsive before this transaction failed.
// Mark it as unresponsive and try to soft-reset it.
this.controller.setStatus(
ControllerStatus.Unresponsive,
);

this._recoveryPhase = ControllerRecoveryPhase.NoACK;

this.driverLog.print(
"Attempting to recover unresponsive controller...",
"warn",
);

// Re-queue the transaction.
// Its message generator may have finished, so reset that too.
transaction.reset();
this.queue.add(transaction.clone());

// Execute the soft-reset asynchronously
void this.softReset().then(() => {
// The controller responded. It is no longer unresponsive
this._controller?.setStatus(ControllerStatus.Ready);
this._recoveryPhase = ControllerRecoveryPhase.None;
}).catch(() => {
// Soft-reset failed. Reject the original transaction and try restarting the driver.
fail(
"Recovering unresponsive controller failed. Restarting the driver...",
);
});

return true;
} else {
// We already attempted to recover from an unresponsive controller.
// Ending up here means the soft-reset also failed and the driver is about to be destroyed
return false;
}
}

/**
* @internal
* Handles the case that the controller didn't send the callback for a SendData in time
* Returns `true` if the transaction failure was handled, `false` if it needs to be rejected.
*/
private handleMissingSendDataCallback(
transaction: Transaction,
error: ZWaveError & {
code: ZWaveErrorCodes.Controller_Timeout;
context: "callback";
},
): boolean {
if (!this._controller) return false;

if (
this._recoveryPhase
=== ControllerRecoveryPhase.CallbackTimeoutAfterReset
) {
const node = this.getNodeUnsafe(transaction.message);
if (!node) return false; // This should never happen, but whatever

// The controller is still timing out transmitting after a soft reset, don't try again.
// Reject the transaction and destroy the driver.
fail("Controller is still timing out. Restarting the driver...");
// Real-world experience has shown that for older controllers this situation can be caused by unresponsive nodes.

return true;
// The following is essentially a copy of handleMissingNodeACK, but with updated error messages
const messagePart1 =
"The node is causing the controller to become unresponsive";

if (node.canSleep) {
if (node.status === NodeStatus.Asleep) {
// We already moved the messages to the wakeup queue before. If we end up here, this means a command
// was sent that may be sent to potentially asleep nodes - including pings.
return false;
}
this.controllerLog.logNode(
node.id,
`${messagePart1}. It is probably asleep, moving its messages to the wakeup queue.`,
"warn",
);

// There is no longer a reference to the current transaction. If it should be moved to the wakeup queue,
// it temporarily needs to be added to the queue again.
const handled = this.mayMoveToWakeupQueue(transaction);
if (handled) {
this.queue.add(transaction);
}

// Mark the node as asleep. This will move the messages to the wakeup queue
node.markAsAsleep();

return handled;
} else {
const errorMsg = `${messagePart1}, it is presumed dead`;
this.controllerLog.logNode(node.id, errorMsg, "warn");

node.markAsDead();

// There is no longer a reference to the current transaction on the queue, so we need to reject it separately.
transaction.setProgress({
state: TransactionState.Failed,
reason: errorMsg,
});

transaction.abort(error);
this.rejectAllTransactionsForNode(node.id, errorMsg);

return true;
}
} else if (this._controller.status !== ControllerStatus.Unresponsive) {
// The controller was responsive before this transaction failed.
// Mark it as unresponsive and try to soft-reset it.
this.controller.setStatus(
ControllerStatus.Unresponsive,
);

this._recoveryPhase = isMissingControllerACK(error)
? ControllerRecoveryPhase.NoACK
: ControllerRecoveryPhase.CallbackTimeout;
this._recoveryPhase = ControllerRecoveryPhase.CallbackTimeout;

this.driverLog.print(
"Attempting to recover unresponsive controller...",
"Controller missed Send Data callback. Attempting to recover...",
"warn",
);

Expand All @@ -3540,26 +3643,16 @@ export class Driver extends TypedEventEmitter<DriverEventCallbacks>
// The controller responded. It is no longer unresponsive
this._controller?.setStatus(ControllerStatus.Ready);

if (this._recoveryPhase === ControllerRecoveryPhase.NoACK) {
this._recoveryPhase = ControllerRecoveryPhase.None;
} else if (
this._recoveryPhase
=== ControllerRecoveryPhase.CallbackTimeout
) {
this._recoveryPhase =
ControllerRecoveryPhase.CallbackTimeoutAfterReset;
}
this._recoveryPhase =
ControllerRecoveryPhase.CallbackTimeoutAfterReset;
}).catch(() => {
// Soft-reset failed. Reject the original transaction and try restarting the driver.
fail(
"Recovering unresponsive controller failed. Restarting the driver...",
);
// Soft-reset failed. Just reject the original transaction.
this.rejectTransaction(transaction, error);
});

return true;
} else {
// We already attempted to recover from an unresponsive controller.
// Ending up here means the soft-reset also failed and the driver is about to be destroyed
// Not sure what to do here
return false;
}
}
Expand Down Expand Up @@ -5566,12 +5659,13 @@ ${handlers.length} left`,
// If a node failed to respond in time, it might be sleeping
if (this.isMissingNodeACK(transaction, error)) {
if (this.handleMissingNodeACK(transaction as any, error)) return;
} else if (isMissingControllerACK(error)) {
if (this.handleMissingControllerACK(transaction, error)) return;
} else if (
isMissingControllerACK(error)
|| (isSendData(transaction.message)
&& isMissingControllerCallback(error))
isSendData(transaction.message)
&& isMissingControllerCallback(error)
) {
if (this.handleUnresponsiveController(transaction, error)) return;
if (this.handleMissingSendDataCallback(transaction, error)) return;
}

this.rejectTransaction(transaction, error);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import {
MockControllerStateKeys,
} from "../../controller/MockControllerState";

import { ZWaveErrorCodes, assertZWaveError } from "@zwave-js/core";
import { NodeStatus, ZWaveErrorCodes, assertZWaveError } from "@zwave-js/core";
import Sinon from "sinon";
import { SoftResetRequest } from "../../serialapi/misc/SoftResetRequest";
import {
Expand Down Expand Up @@ -90,7 +90,8 @@ integrationTest(
// Soft reset should restore normal operation
if (msg instanceof SoftResetRequest) {
shouldTimeOut = false;
return true;
// Delegate to the default behavior
return false;
}
},
};
Expand Down Expand Up @@ -124,8 +125,10 @@ integrationTest(
);

integrationTest(
"Destroy driver if SendData is still missing the callback after soft-reset",
"Mark node as dead if SendData is still missing the callback after soft-reset",
{
// Real-world experience has shown that for older controllers this situation can be caused by dead nodes
// We don't want to restart the driver in that case, but mark the node as dead instead
// debug: true,

// provisioningDirectory: path.join(
Expand Down Expand Up @@ -206,14 +209,14 @@ integrationTest(
(msg) => msg.functionType === FunctionType.SoftReset,
);

// The ping should eventually fail
// The ping should eventually fail and the node be marked dead
t.false(await pingPromise);

// The driver should have been destroyed
await wait(100);
assertZWaveError(t, errorSpy.getCall(0).args[0], {
errorCode: ZWaveErrorCodes.Driver_Failed,
});
t.is(node.status, NodeStatus.Dead);

// The error event should not have been emitted
await wait(300);
t.is(errorSpy.callCount, 0);
},
},
);
Expand Down Expand Up @@ -286,7 +289,8 @@ integrationTest(
// Soft reset should restore normal operation
if (msg instanceof SoftResetRequest) {
shouldTimeOut = false;
return true;
// Delegate to the default behavior
return false;
}
},
};
Expand Down

0 comments on commit 15ed348

Please sign in to comment.