Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: when SendData callback times out continuously, mark node as dead and do not restart driver #6403

Merged
merged 1 commit into from
Oct 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 123 additions & 29 deletions packages/zwave-js/src/lib/driver/Driver.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3493,40 +3493,143 @@ export class Driver extends TypedEventEmitter<DriverEventCallbacks>
}
}

private handleUnresponsiveController(
/**
* @internal
* Handles the case that the controller didn't acknowledge a command in time
* Returns `true` if the transaction failure was handled, `false` if it needs to be rejected.
*/
private handleMissingControllerACK(
transaction: Transaction,
error: ZWaveError,
error: ZWaveError & {
code: ZWaveErrorCodes.Controller_Timeout;
context: "ACK";
},
): boolean {
if (!this._controller) return false;

const fail = (message: string) => {
this.rejectTransaction(transaction, error);

void this.destroyWithMessage(message);
};

if (this._controller.status !== ControllerStatus.Unresponsive) {
// The controller was responsive before this transaction failed.
// Mark it as unresponsive and try to soft-reset it.
this.controller.setStatus(
ControllerStatus.Unresponsive,
);

this._recoveryPhase = ControllerRecoveryPhase.NoACK;

this.driverLog.print(
"Attempting to recover unresponsive controller...",
"warn",
);

// Re-queue the transaction.
// Its message generator may have finished, so reset that too.
transaction.reset();
this.queue.add(transaction.clone());

// Execute the soft-reset asynchronously
void this.softReset().then(() => {
// The controller responded. It is no longer unresponsive
this._controller?.setStatus(ControllerStatus.Ready);
this._recoveryPhase = ControllerRecoveryPhase.None;
}).catch(() => {
// Soft-reset failed. Reject the original transaction and try restarting the driver.
fail(
"Recovering unresponsive controller failed. Restarting the driver...",
);
});

return true;
} else {
// We already attempted to recover from an unresponsive controller.
// Ending up here means the soft-reset also failed and the driver is about to be destroyed
return false;
}
}

/**
* @internal
* Handles the case that the controller didn't send the callback for a SendData in time
* Returns `true` if the transaction failure was handled, `false` if it needs to be rejected.
*/
private handleMissingSendDataCallback(
transaction: Transaction,
error: ZWaveError & {
code: ZWaveErrorCodes.Controller_Timeout;
context: "callback";
},
): boolean {
if (!this._controller) return false;

if (
this._recoveryPhase
=== ControllerRecoveryPhase.CallbackTimeoutAfterReset
) {
const node = this.getNodeUnsafe(transaction.message);
if (!node) return false; // This should never happen, but whatever

// The controller is still timing out transmitting after a soft reset, don't try again.
// Reject the transaction and destroy the driver.
fail("Controller is still timing out. Restarting the driver...");
// Real-world experience has shown that for older controllers this situation can be caused by unresponsive nodes.

return true;
// The following is essentially a copy of handleMissingNodeACK, but with updated error messages
const messagePart1 =
"The node is causing the controller to become unresponsive";

if (node.canSleep) {
if (node.status === NodeStatus.Asleep) {
// We already moved the messages to the wakeup queue before. If we end up here, this means a command
// was sent that may be sent to potentially asleep nodes - including pings.
return false;
}
this.controllerLog.logNode(
node.id,
`${messagePart1}. It is probably asleep, moving its messages to the wakeup queue.`,
"warn",
);

// There is no longer a reference to the current transaction. If it should be moved to the wakeup queue,
// it temporarily needs to be added to the queue again.
const handled = this.mayMoveToWakeupQueue(transaction);
if (handled) {
this.queue.add(transaction);
}

// Mark the node as asleep. This will move the messages to the wakeup queue
node.markAsAsleep();

return handled;
} else {
const errorMsg = `${messagePart1}, it is presumed dead`;
this.controllerLog.logNode(node.id, errorMsg, "warn");

node.markAsDead();

// There is no longer a reference to the current transaction on the queue, so we need to reject it separately.
transaction.setProgress({
state: TransactionState.Failed,
reason: errorMsg,
});

transaction.abort(error);
this.rejectAllTransactionsForNode(node.id, errorMsg);

return true;
}
} else if (this._controller.status !== ControllerStatus.Unresponsive) {
// The controller was responsive before this transaction failed.
// Mark it as unresponsive and try to soft-reset it.
this.controller.setStatus(
ControllerStatus.Unresponsive,
);

this._recoveryPhase = isMissingControllerACK(error)
? ControllerRecoveryPhase.NoACK
: ControllerRecoveryPhase.CallbackTimeout;
this._recoveryPhase = ControllerRecoveryPhase.CallbackTimeout;

this.driverLog.print(
"Attempting to recover unresponsive controller...",
"Controller missed Send Data callback. Attempting to recover...",
"warn",
);

Expand All @@ -3540,26 +3643,16 @@ export class Driver extends TypedEventEmitter<DriverEventCallbacks>
// The controller responded. It is no longer unresponsive
this._controller?.setStatus(ControllerStatus.Ready);

if (this._recoveryPhase === ControllerRecoveryPhase.NoACK) {
this._recoveryPhase = ControllerRecoveryPhase.None;
} else if (
this._recoveryPhase
=== ControllerRecoveryPhase.CallbackTimeout
) {
this._recoveryPhase =
ControllerRecoveryPhase.CallbackTimeoutAfterReset;
}
this._recoveryPhase =
ControllerRecoveryPhase.CallbackTimeoutAfterReset;
}).catch(() => {
// Soft-reset failed. Reject the original transaction and try restarting the driver.
fail(
"Recovering unresponsive controller failed. Restarting the driver...",
);
// Soft-reset failed. Just reject the original transaction.
this.rejectTransaction(transaction, error);
});

return true;
} else {
// We already attempted to recover from an unresponsive controller.
// Ending up here means the soft-reset also failed and the driver is about to be destroyed
// Not sure what to do here
return false;
}
}
Expand Down Expand Up @@ -5566,12 +5659,13 @@ ${handlers.length} left`,
// If a node failed to respond in time, it might be sleeping
if (this.isMissingNodeACK(transaction, error)) {
if (this.handleMissingNodeACK(transaction as any, error)) return;
} else if (isMissingControllerACK(error)) {
if (this.handleMissingControllerACK(transaction, error)) return;
} else if (
isMissingControllerACK(error)
|| (isSendData(transaction.message)
&& isMissingControllerCallback(error))
isSendData(transaction.message)
&& isMissingControllerCallback(error)
) {
if (this.handleUnresponsiveController(transaction, error)) return;
if (this.handleMissingSendDataCallback(transaction, error)) return;
}

this.rejectTransaction(transaction, error);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import {
MockControllerStateKeys,
} from "../../controller/MockControllerState";

import { ZWaveErrorCodes, assertZWaveError } from "@zwave-js/core";
import { NodeStatus, ZWaveErrorCodes, assertZWaveError } from "@zwave-js/core";
import Sinon from "sinon";
import { SoftResetRequest } from "../../serialapi/misc/SoftResetRequest";
import {
Expand Down Expand Up @@ -90,7 +90,8 @@ integrationTest(
// Soft reset should restore normal operation
if (msg instanceof SoftResetRequest) {
shouldTimeOut = false;
return true;
// Delegate to the default behavior
return false;
}
},
};
Expand Down Expand Up @@ -124,8 +125,10 @@ integrationTest(
);

integrationTest(
"Destroy driver if SendData is still missing the callback after soft-reset",
"Mark node as dead if SendData is still missing the callback after soft-reset",
{
// Real-world experience has shown that for older controllers this situation can be caused by dead nodes
// We don't want to restart the driver in that case, but mark the node as dead instead
// debug: true,

// provisioningDirectory: path.join(
Expand Down Expand Up @@ -206,14 +209,14 @@ integrationTest(
(msg) => msg.functionType === FunctionType.SoftReset,
);

// The ping should eventually fail
// The ping should eventually fail and the node be marked dead
t.false(await pingPromise);

// The driver should have been destroyed
await wait(100);
assertZWaveError(t, errorSpy.getCall(0).args[0], {
errorCode: ZWaveErrorCodes.Driver_Failed,
});
t.is(node.status, NodeStatus.Dead);

// The error event should not have been emitted
await wait(300);
t.is(errorSpy.callCount, 0);
},
},
);
Expand Down Expand Up @@ -286,7 +289,8 @@ integrationTest(
// Soft reset should restore normal operation
if (msg instanceof SoftResetRequest) {
shouldTimeOut = false;
return true;
// Delegate to the default behavior
return false;
}
},
};
Expand Down