Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Maximum Retry Limit for Firmware Upgrades #109

Open
wants to merge 13 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docker-compose-addons.yml
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ services:
BLOB_STORAGE_PROVIDER: ${BLOB_STORAGE_PROVIDER}
BLOB_STORAGE_BUCKET: ${BLOB_STORAGE_BUCKET}

FW_UPGRADE_MAX_RETRY_LIMIT: ${FW_UPGRADE_MAX_RETRY_LIMIT}

GCP_PROJECT: ${GCP_PROJECT_ID}
GOOGLE_APPLICATION_CREDENTIALS: '/google/gcp_credentials.json'

Expand Down
24 changes: 23 additions & 1 deletion resources/sql_scripts/CVManager_CreateTables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -459,4 +459,26 @@ CREATE TABLE IF NOT EXISTS public.obu_ota_requests (
CONSTRAINT fk_manufacturer FOREIGN KEY (manufacturer) REFERENCES public.manufacturers(manufacturer_id)
);

CREATE SCHEMA IF NOT EXISTS keycloak;
CREATE SCHEMA IF NOT EXISTS keycloak;

-- Per-RSU counter of consecutive firmware upgrade failures.
-- rsu_id is the primary key, so there is at most one counter row per RSU.
CREATE TABLE IF NOT EXISTS public.consecutive_firmware_upgrade_failures
(
rsu_id integer NOT NULL,
consecutive_failures integer NOT NULL,
CONSTRAINT consecutive_firmware_upgrade_failures_pkey PRIMARY KEY (rsu_id),
-- Each counter row must reference an existing row in public.rsus.
-- NO ACTION: deleting or re-keying a referenced RSU is rejected while its counter row exists.
CONSTRAINT fk_rsu_id FOREIGN KEY (rsu_id)
REFERENCES public.rsus (rsu_id) MATCH SIMPLE
ON UPDATE NO ACTION
ON DELETE NO ACTION
);

-- History of the moments at which an RSU reached the maximum firmware upgrade retry limit.
-- The composite primary key (rsu_id, reached_at) allows many rows per RSU, one per timestamp.
CREATE TABLE IF NOT EXISTS public.max_retry_limit_reached_instances
(
rsu_id integer NOT NULL,
-- Stored without time zone information.
reached_at timestamp without time zone NOT NULL,
CONSTRAINT max_retry_limit_reached_instances_pkey PRIMARY KEY (rsu_id, reached_at),
-- Each row must reference an existing row in public.rsus.
-- NO ACTION: deleting or re-keying a referenced RSU is rejected while rows here reference it.
CONSTRAINT fk_rsu_id FOREIGN KEY (rsu_id)
REFERENCES public.rsus (rsu_id) MATCH SIMPLE
ON UPDATE NO ACTION
ON DELETE NO ACTION
);
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
-- Run this SQL update script if you already have a deployed CV Manager PostgreSQL database

-- Per-RSU counter of consecutive firmware upgrade failures.
-- rsu_id is the primary key, so there is at most one counter row per RSU.
CREATE TABLE IF NOT EXISTS public.consecutive_firmware_upgrade_failures
(
rsu_id integer NOT NULL,
consecutive_failures integer NOT NULL,
CONSTRAINT consecutive_firmware_upgrade_failures_pkey PRIMARY KEY (rsu_id),
-- Each counter row must reference an existing row in public.rsus.
-- NO ACTION: deleting or re-keying a referenced RSU is rejected while its counter row exists.
CONSTRAINT fk_rsu_id FOREIGN KEY (rsu_id)
REFERENCES public.rsus (rsu_id) MATCH SIMPLE
ON UPDATE NO ACTION
ON DELETE NO ACTION
);

-- History of the moments at which an RSU reached the maximum firmware upgrade retry limit.
-- The composite primary key (rsu_id, reached_at) allows many rows per RSU, one per timestamp.
CREATE TABLE IF NOT EXISTS public.max_retry_limit_reached_instances
(
rsu_id integer NOT NULL,
-- Stored without time zone information.
reached_at timestamp without time zone NOT NULL,
CONSTRAINT max_retry_limit_reached_instances_pkey PRIMARY KEY (rsu_id, reached_at),
-- Each row must reference an existing row in public.rsus.
-- NO ACTION: deleting or re-keying a referenced RSU is rejected while rows here reference it.
CONSTRAINT fk_rsu_id FOREIGN KEY (rsu_id)
REFERENCES public.rsus (rsu_id) MATCH SIMPLE
ON UPDATE NO ACTION
ON DELETE NO ACTION
);
dmccoystephenson marked this conversation as resolved.
Show resolved Hide resolved
2 changes: 2 additions & 0 deletions sample.env
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,8 @@ BLOB_STORAGE_PROVIDER=DOCKER
BLOB_STORAGE_BUCKET=
## Docker volume mount point for BLOB storage (if using Docker)
HOST_BLOB_STORAGE_DIRECTORY=./local_blob_storage
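## Maximum number of consecutive failed firmware upgrade attempts allowed per RSU; once reached, the RSU's target firmware version is reset to its current version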
FW_UPGRADE_MAX_RETRY_LIMIT=3
# ---------------------------------------------------------------------

# Geo-spatial message query Addon:
Expand Down
94 changes: 91 additions & 3 deletions services/addons/images/firmware_manager/firmware_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,17 @@
upgrade_queue_info = {}
active_upgrades_lock = Lock()


# Changed from a constant to a function to help with unit testing
def get_upgrade_limit() -> int:
    """Return the limit configured in the ``ACTIVE_UPGRADE_LIMIT`` variable.

    The environment variable is read on every call instead of being cached in
    a module-level constant, which lets unit tests change it between calls.
    When the variable is not set, the limit defaults to 1.

    Returns:
        The configured limit as an integer.

    Raises:
        ValueError: If ``ACTIVE_UPGRADE_LIMIT`` cannot be parsed as an integer.
    """
    try:
        return int(os.environ.get("ACTIVE_UPGRADE_LIMIT", "1"))
    except ValueError as err:
        # Re-raise with a message that names the offending variable, keeping
        # the original parsing error attached as the explicit cause.
        raise ValueError(
            "The environment variable 'ACTIVE_UPGRADE_LIMIT' must be an integer."
        ) from err


# Function to query the CV Manager PostgreSQL database for RSUs that have:
# - A different target version than their current version
Expand Down Expand Up @@ -129,6 +133,17 @@ def init_firmware_upgrade():
500,
)

# Check if latest ping was unsuccessful
if not was_latest_ping_successful_for_rsu(request_args["rsu_ip"]):
return (
jsonify(
{
"error": f"Firmware upgrade failed to start for '{request_args['rsu_ip']}': device is unreachable"
}
),
500,
)

# Pull RSU data from the PostgreSQL database
logging.info(f"Querying RSU data for '{request_args['rsu_ip']}'")
rsu_to_upgrade = get_rsu_upgrade_data(request_args["rsu_ip"])
Expand Down Expand Up @@ -197,6 +212,7 @@ def firmware_upgrade_completed():

# Update RSU firmware_version in PostgreSQL if the upgrade was successful
if request_args["status"] == "success":
reset_consecutive_failure_count_for_rsu(request_args["rsu_ip"])
try:
upgrade_info = active_upgrades[request_args["rsu_ip"]]
query = f"UPDATE public.rsus SET firmware_version={upgrade_info['target_firmware_id']} WHERE ipv4_address='{request_args['rsu_ip']}'"
Expand All @@ -213,6 +229,22 @@ def firmware_upgrade_completed():
),
500,
)
else:
increment_consecutive_failure_count_for_rsu(request_args["rsu_ip"])
if is_rsu_at_max_retries_limit(request_args["rsu_ip"]):
logging.error(
f"RSU {request_args['rsu_ip']} has reached the maximum number of upgrade retries. Setting target_firmware_version to firmware_version and resetting consecutive failures count."
)

# set target_firmware_version to firmware_version value
query = f"UPDATE public.rsus SET target_firmware_version=firmware_version WHERE ipv4_address='{request_args['rsu_ip']}'"
pgquery.write_db(query)

log_max_retries_reached_incident_for_rsu_to_postgres(
request_args["rsu_ip"]
)

reset_consecutive_failure_count_for_rsu(request_args["rsu_ip"])

# Remove firmware upgrade from active upgrades
logging.info(
Expand Down Expand Up @@ -240,7 +272,15 @@ def list_active_upgrades():
"target_firmware_version": value["target_firmware_version"],
"install_package": value["install_package"],
}
return jsonify({"active_upgrades": sanitized_active_upgrades, "upgrade_queue": list(upgrade_queue)}), 200
return (
jsonify(
{
"active_upgrades": sanitized_active_upgrades,
"upgrade_queue": list(upgrade_queue),
}
),
200,
)


# Scheduled firmware upgrade checker
Expand All @@ -259,18 +299,66 @@ def check_for_upgrades():
):
continue

# Check if latest ping was unsuccessful
if not was_latest_ping_successful_for_rsu(rsu["ipv4_address"]):
logging.info(
f"Skipping firmware upgrade for '{rsu['ipv4_address']}': device is unreachable"
)
continue

# Add the RSU to the upgrade queue and record the necessary upgrade information
logging.info(
f"Adding '{rsu["ipv4_address"]}' to the firmware manager upgrade queue"
)
upgrade_queue.extend([rsu["ipv4_address"]])
upgrade_queue_info[rsu["ipv4_address"]] = rsu
logging.info(f"Firmware upgrade successfully started for '{rsu["ipv4_address"]}'")
logging.info(
f"Firmware upgrade successfully started for '{rsu["ipv4_address"]}'"
)

# Start any processes that can be started
start_tasks_from_queue()


def was_latest_ping_successful_for_rsu(rsu_ip):
    """Return the most recently recorded ping result for the RSU at ``rsu_ip``.

    Queries the ``ping`` table for the newest row (by ``timestamp``) belonging
    to the RSU whose ``ipv4_address`` equals ``rsu_ip``. Returns ``False`` when
    the query yields no result; otherwise returns the stored ``result`` value
    exactly as the database layer delivered it.
    """
    # NOTE(review): rsu_ip is interpolated directly into the SQL text --
    # confirm that callers only pass validated addresses.
    rows = pgquery.query_db(
        "select result from ping where rsu_id="
        f"(select rsu_id from rsus where ipv4_address='{rsu_ip}') "
        "order by timestamp desc limit 1"
    )

    if len(rows) > 0 and len(rows[0]) > 0:
        # NOTE(review): callers negate this value with `not`; confirm the
        # driver returns a value whose truthiness matches the ping outcome.
        latest_result = rows[0][0]
        logging.info(f"Latest ping result for '{rsu_ip}': {latest_result}")
        return latest_result

    # no ping results have been recorded for this RSU
    return False


def increment_consecutive_failure_count_for_rsu(rsu_ip):
    """Add one to the consecutive firmware upgrade failure count of an RSU.

    Issues a single upsert against ``consecutive_firmware_upgrade_failures``:
    a row with a count of 1 is inserted for the RSU whose ``ipv4_address``
    equals ``rsu_ip``, or, if a row for that RSU already exists, its
    ``consecutive_failures`` value is incremented by 1.
    """
    # NOTE(review): rsu_ip is interpolated directly into the SQL text --
    # confirm that callers only pass validated addresses.
    pgquery.write_db(
        "insert into consecutive_firmware_upgrade_failures (rsu_id, consecutive_failures) "
        f"values ((select rsu_id from rsus where ipv4_address='{rsu_ip}'), 1) "
        "on conflict (rsu_id) do update set "
        "consecutive_failures=consecutive_firmware_upgrade_failures.consecutive_failures+1"
    )


def reset_consecutive_failure_count_for_rsu(rsu_ip):
    """Set the consecutive firmware upgrade failure count of an RSU to zero.

    Issues a single upsert against ``consecutive_firmware_upgrade_failures``:
    a row with a count of 0 is inserted for the RSU whose ``ipv4_address``
    equals ``rsu_ip``, or, if a row for that RSU already exists, its
    ``consecutive_failures`` value is overwritten with 0.
    """
    # NOTE(review): rsu_ip is interpolated directly into the SQL text --
    # confirm that callers only pass validated addresses.
    pgquery.write_db(
        "insert into consecutive_firmware_upgrade_failures (rsu_id, consecutive_failures) "
        f"values ((select rsu_id from rsus where ipv4_address='{rsu_ip}'), 0) "
        "on conflict (rsu_id) do update set consecutive_failures=0"
    )


def is_rsu_at_max_retries_limit(rsu_ip):
    """Report whether an RSU has used up its firmware upgrade retries.

    The limit is read from the ``FW_UPGRADE_MAX_RETRY_LIMIT`` environment
    variable and defaults to 3 when the variable is missing or blank. The
    RSU's current count is read from ``consecutive_firmware_upgrade_failures``.

    Args:
        rsu_ip: The ``ipv4_address`` value identifying the RSU in ``rsus``.

    Returns:
        ``True`` when the recorded consecutive failure count is greater than
        or equal to the limit; ``False`` otherwise, including when no count
        has been recorded for the RSU.

    Raises:
        ValueError: If ``FW_UPGRADE_MAX_RETRY_LIMIT`` is set to a value that
            is not an integer.
    """
    default_limit = "3"
    # The compose file forwards the variable as `${FW_UPGRADE_MAX_RETRY_LIMIT}`,
    # which becomes an empty string when it is unset on the host; treat a
    # blank value the same as a missing one instead of failing in int().
    raw_limit = os.environ.get("FW_UPGRADE_MAX_RETRY_LIMIT", default_limit).strip()
    try:
        max_retries = int(raw_limit or default_limit)
    except ValueError as err:
        raise ValueError(
            "The environment variable 'FW_UPGRADE_MAX_RETRY_LIMIT' must be an integer."
        ) from err

    # NOTE(review): rsu_ip is interpolated directly into the SQL text --
    # confirm that callers only pass validated addresses.
    query_result = pgquery.query_db(
        f"select consecutive_failures from consecutive_firmware_upgrade_failures where rsu_id=(select rsu_id from rsus where ipv4_address='{rsu_ip}')"
    )
    if len(query_result) == 0 or len(query_result[0]) == 0:
        # no failures have been recorded for this RSU, so it cannot be at the limit
        return False
    consecutive_failures = query_result[0][0]
    return consecutive_failures >= max_retries


def log_max_retries_reached_incident_for_rsu_to_postgres(rsu_ip):
    """Record that an RSU has reached the maximum firmware upgrade retries.

    Inserts one row into ``max_retry_limit_reached_instances`` for the RSU
    whose ``ipv4_address`` equals ``rsu_ip``, with ``reached_at`` set by the
    database's ``now()``.
    """
    # NOTE(review): rsu_ip is interpolated directly into the SQL text --
    # confirm that callers only pass validated addresses.
    insert_statement = (
        "insert into max_retry_limit_reached_instances (rsu_id, reached_at) "
        f"values ((select rsu_id from rsus where ipv4_address='{rsu_ip}'), now())"
    )
    pgquery.write_db(insert_statement)


def serve_rest_api():
# Run Flask app for manually initiated firmware upgrades
logging.info("Initiating Firmware Manager REST API...")
Expand Down
3 changes: 3 additions & 0 deletions services/addons/images/firmware_manager/sample.env
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ BLOB_STORAGE_BUCKET=
## Docker volume mount point for BLOB storage (if using DOCKER)
HOST_BLOB_STORAGE_DIRECTORY=./local_blob_storage

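## Maximum number of consecutive failed firmware upgrade attempts allowed per RSU; once reached, the RSU's target firmware version is reset to its current version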
FW_UPGRADE_MAX_RETRY_LIMIT=3

# For users using GCP cloud storage
GCP_PROJECT=""
GOOGLE_APPLICATION_CREDENTIALS=""
Expand Down
Loading
Loading