Skip to content

Commit

Permalink
Fault tolerance (#82)
Browse files Browse the repository at this point in the history
* Add fault tolerance in Area and Video processing.

* Include docker healthcheck in containers.

* Change start_services.bash with supervisord.

* Use supervisorctl in healthcheck.

* Removing empty line in dockerfile. Fixing comment.

* Update libs/area_reporting.py

Co-authored-by: Renzo Gambone <42361379+renzodgc@users.noreply.github.com>

* Include max_retries in threads restarts.

* Add MaxThreadRestarts parameter in the config files.

* Reset the restarts counter when the last restart happened more than 1 minute ago.

Co-authored-by: Renzo Gambone <42361379+renzodgc@users.noreply.github.com>
  • Loading branch information
pgrill and renzodgc authored Nov 20, 2020
1 parent 01b6cb2 commit bf82aad
Show file tree
Hide file tree
Showing 21 changed files with 140 additions and 60 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,8 @@ All the configurations are grouped in *sections* and some of them can vary depen
- `DashboardURL`: Sets the url where the frontend is running. Unless you are using a custom domain, you should keep this value as https://beta.lanthorn.ai/.
- `EnableSlackNotifications`: A boolean parameter to enable/disable the Slack integration for notifications and daily reports. We recommend not editing this parameter directly and manage it from the [UI](https://beta.lanthorn.ai) to configure your workspace correctly.
- `SlackChannel`: Configures the slack channel used by the notifications. The chosen slack channel must exist in the configured workspace.
- `OccupancyAlertsMinInterval`: Sets the desired interval (in seconds) between occupancy alerts.
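- `MaxThreadRestarts`: Defines the maximum number of consecutive restarts allowed per processing thread after a failure. The counter is reset when a thread has been running for more than one minute before failing again.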

- `[Api]`
- `Host`: Configures the host IP of the processor's API (inside docker). We recommend not changing this value and keeping it as *0.0.0.0*.
Expand Down
6 changes: 4 additions & 2 deletions amd64-usbtpu.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3-requests \
build-essential \
libedgetpu1-std \
supervisor \
&& rm -rf /var/lib/apt/lists/* \
&& python3 -m pip install --upgrade pip setuptools==41.0.0 wheel && pip install -r /requirements.txt \
https://dl.google.com/coral/python/tflite_runtime-2.1.0.post1-cp37-cp37m-linux_x86_64.whl \
Expand All @@ -99,8 +100,9 @@ RUN cd / && apt-get update && apt-get install -y git python3-edgetpu && git clon
https://github.com/google-coral/project-posenet.git && sed -i 's/sudo / /g' \
/project-posenet/install_requirements.sh && sh /project-posenet/install_requirements.sh
ENV PYTHONPATH=$PYTHONPATH:/project-posenet
ENV CONFIG_FILE=config-coral.ini

COPY . /repo
WORKDIR /repo
ENTRYPOINT ["bash", "start_services.bash"]
CMD ["config-coral.ini"]
HEALTHCHECK --interval=30s --retries=2 --start-period=15s CMD bash healthcheck.bash
CMD supervisord -c supervisord.conf -n
2 changes: 2 additions & 0 deletions config-coral.ini
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ EnableSlackNotifications = no
SlackChannel = lanthorn-notifications
; OccupancyAlertsMinInterval time is measured in seconds (if interval < 0 then no occupancy alerts are triggered)
OccupancyAlertsMinInterval = 180
MaxThreadRestarts = 5


[API]
Host = 0.0.0.0
Expand Down
1 change: 1 addition & 0 deletions config-jetson.ini
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ EnableSlackNotifications = no
SlackChannel = lanthorn-notifications
; OccupancyAlertsMinInterval time is measured in seconds (if interval < 0 then no occupancy alerts are triggered)
OccupancyAlertsMinInterval = 180
MaxThreadRestarts = 5

[API]
Host = 0.0.0.0
Expand Down
2 changes: 2 additions & 0 deletions config-x86-gpu.ini
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ DashboardURL = http://0.0.0.0:8000
ScreenshotsDirectory = /repo/data/processor/static/screenshots
EnableSlackNotifications = no
SlackChannel = lanthorn-notifications
OccupancyAlertsMinInterval = 180
MaxThreadRestarts = 5

[Area_0]
Id = area0
Expand Down
1 change: 1 addition & 0 deletions config-x86-openvino.ini
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ EnableSlackNotifications = no
SlackChannel = lanthorn-notifications
; OccupancyAlertsMinInterval time is measured in seconds (if interval < 0 then no occupancy alerts are triggered)
OccupancyAlertsMinInterval = 180
MaxThreadRestarts = 5

[Area_0]
Id = area0
Expand Down
1 change: 1 addition & 0 deletions config-x86.ini
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ EnableSlackNotifications = no
SlackChannel = lanthorn-notifications
; OccupancyAlertsMinInterval time is measured in seconds (if interval < 0 then no occupancy alerts are triggered)
OccupancyAlertsMinInterval = 180
MaxThreadRestarts = 5

[Area_0]
Id = area0
Expand Down
8 changes: 5 additions & 3 deletions coral-dev-board.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3-pip \
python3-scipy \
python3-wget \
supervisor \
&& rm -rf /var/lib/apt/lists/* \
&& python3 -m pip install --upgrade pip setuptools==41.0.0 && pip install -r /requirements.txt \
https://dl.google.com/coral/python/tflite_runtime-2.1.0.post1-cp37-cp37m-linux_aarch64.whl \
Expand All @@ -82,9 +83,10 @@ RUN cd / && apt-get update && apt-get install -y git python3-edgetpu && git clon
https://github.com/google-coral/project-posenet.git && sed -i 's/sudo / /g' \
/project-posenet/install_requirements.sh && sh /project-posenet/install_requirements.sh
ENV PYTHONPATH=$PYTHONPATH:/project-posenet

ENV CONFIG_FILE=config-coral.ini
# Also if you use opencv: LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libgomp.so.1.0.0"

COPY . /repo
WORKDIR /repo
ENTRYPOINT ["bash", "start_services.bash"]
CMD ["config-coral.ini"]
HEALTHCHECK --interval=30s --retries=2 --start-period=15s CMD bash healthcheck.bash
CMD supervisord -c supervisord.conf -n
6 changes: 6 additions & 0 deletions healthcheck.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Docker HEALTHCHECK probe for the supervisord-managed container.
#
# Exits 1 (unhealthy) when `supervisorctl status all` prints a line containing
# FATAL or UNKNOWN (case-insensitive), and 0 (healthy) otherwise.
#
# Only grep's exit status matters, so -q is used instead of capturing the
# matched text into a variable that was never read, and -E gives portable
# alternation in place of the GNU-only "\|" escape.
#
# NOTE(review): if supervisorctl itself fails and its output contains neither
# word, the probe still reports healthy -- confirm that this is intended.
if supervisorctl -c supervisord.conf status all | grep -Eqi 'FATAL|UNKNOWN'; then
  exit 1
else
  exit 0
fi
6 changes: 4 additions & 2 deletions jetson-nano.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3-pip \
python3-scipy \
python3-wget \
supervisor \
&& rm -rf /var/lib/apt/lists/* \
&& ln -sf $(which gcc) /usr/local/bin/gcc-aarch64-linux-gnu \
&& ln -sf $(which g++) /usr/local/bin/g++-aarch64-linux-gnu \
Expand All @@ -96,8 +97,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
ENV DEV_ALLOW_ALL_ORIGINS=true
ENV AWS_SHARED_CREDENTIALS_FILE=/repo/.aws/credentials
ENV AWS_CONFIG_FILE=/repo/.aws/config
ENV CONFIG_FILE=config-jetson.ini

COPY . /repo/
WORKDIR /repo
ENTRYPOINT ["bash", "start_services.bash"]
CMD ["config-jetson.ini"]
HEALTHCHECK --interval=30s --retries=2 --start-period=15s CMD bash healthcheck.bash
CMD supervisord -c supervisord.conf -n
6 changes: 4 additions & 2 deletions jetson-tx2.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3-pip \
python3-scipy \
python3-wget \
supervisor \
&& rm -rf /var/lib/apt/lists/* \
&& ln -sf $(which gcc) /usr/local/bin/gcc-aarch64-linux-gnu \
&& ln -sf $(which g++) /usr/local/bin/g++-aarch64-linux-gnu \
Expand All @@ -92,8 +93,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
ENV DEV_ALLOW_ALL_ORIGINS=true
ENV AWS_SHARED_CREDENTIALS_FILE=/repo/.aws/credentials
ENV AWS_CONFIG_FILE=/repo/.aws/config
ENV CONFIG_FILE=config-jetson.ini

COPY . /repo/
WORKDIR /repo
ENTRYPOINT ["bash", "start_services.bash"]
CMD ["config-jetson.ini"]
HEALTHCHECK --interval=30s --retries=2 --start-period=15s CMD bash healthcheck.bash
CMD supervisord -c supervisord.conf -n
33 changes: 0 additions & 33 deletions jetson-web-gui.Dockerfile

This file was deleted.

6 changes: 4 additions & 2 deletions libs/area_reporting.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@ def __init__(self, config, area):
camera['file_path'] = os.path.join(self.log_dir, camera['id'], "objects_log")
camera['last_processed_time'] = time.time()

self.mail_service = MailService(config)
self.slack_service = SlackService(config)
if self.should_send_email_notifications:
self.mail_service = MailService(config)
if self.should_send_slack_notifications:
self.slack_service = SlackService(config)

def process_area(self):
# Sleep for a while so cameras start processing
Expand Down
25 changes: 23 additions & 2 deletions libs/area_threading.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import os
import logging

import time

from datetime import datetime
from threading import Thread
from libs.area_reporting import AreaReporting as AreaEngine
import logging

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -33,7 +37,24 @@ def __init__(self, config, area):

def run(self):
    """Run the area engine, restarting it whenever it raises.

    Builds the AreaEngine for this thread's area and calls process_area()
    in a loop. When process_area() raises, the error is logged, the thread
    sleeps 5 seconds and the call is retried. The restart counter is reset
    when the failed attempt had been running for more than 60 seconds.

    Raises:
        Exception: the exception raised by process_area() once the restart
            counter equals the "MaxThreadRestarts" value of the [App]
            config section.
    """
    self.engine = AreaEngine(self.config, self.area)
    restarts = 0
    max_restarts = int(self.config.get_section_dict("App")["MaxThreadRestarts"])
    while True:
        # Monotonic clock: a wall-clock adjustment must not make a short-lived
        # attempt look long-lived (which would wrongly reset the counter).
        attempt_started = time.monotonic()
        try:
            # NOTE(review): if process_area() returns without raising, the loop
            # calls it again immediately -- confirm this is intended.
            self.engine.process_area()
        except Exception as e:
            # Use the module-level logger so records carry this module's name.
            logger.error(e, exc_info=True)
            logger.info(f"Exception processing area {self.area['name']}")
            if time.monotonic() - attempt_started > 60:
                # The attempt ran for more than 1 minute before failing, so
                # the failures are not back-to-back: start counting again.
                restarts = 0
            if restarts == max_restarts:
                # Bare raise keeps the original traceback intact.
                raise
            # Sleep the thread for 5 seconds and try to process the area again
            time.sleep(5)
            logger.info("Restarting the area processing")
            restarts += 1

def stop(self):
self.engine.stop_process_area()
Expand Down
24 changes: 22 additions & 2 deletions libs/engine_threading.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import os
import logging
import time

from datetime import datetime
from shutil import rmtree
from threading import Thread
from libs.distancing import Distancing as CvEngine
import logging

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -43,7 +46,24 @@ def __init__(self, config, source, live_feed_enabled=True):

def run(self):
    """Run the video engine, restarting it whenever it raises.

    Builds the CvEngine for this thread's source and calls process_video()
    on the source URL in a loop. When process_video() raises, the error is
    logged, the thread sleeps 5 seconds and the call is retried. The restart
    counter is reset when the failed attempt had been running for more than
    60 seconds.

    Raises:
        Exception: the exception raised by process_video() once the restart
            counter equals the "MaxThreadRestarts" value of the [App]
            config section.
    """
    self.engine = CvEngine(self.config, self.source['section'], self.live_feed_enabled)
    restarts = 0
    max_restarts = int(self.config.get_section_dict("App")["MaxThreadRestarts"])
    while True:
        # Monotonic clock: a wall-clock adjustment must not make a short-lived
        # attempt look long-lived (which would wrongly reset the counter).
        attempt_started = time.monotonic()
        try:
            # NOTE(review): if process_video() returns without raising, the
            # loop calls it again immediately -- confirm this is intended.
            self.engine.process_video(self.source['url'])
        except Exception as e:
            # Use the module-level logger so records carry this module's name.
            logger.error(e, exc_info=True)
            logger.info(f"Exception processing video for source {self.source['name']}")
            if time.monotonic() - attempt_started > 60:
                # The attempt ran for more than 1 minute before failing, so
                # the failures are not back-to-back: start counting again.
                restarts = 0
            if restarts == max_restarts:
                # Bare raise keeps the original traceback intact.
                raise
            # Sleep the thread for 5 seconds and try to process the video again
            time.sleep(5)
            logger.info("Restarting the video processing")
            restarts += 1

def stop(self):
self.engine.stop_process_video()
Expand Down
2 changes: 1 addition & 1 deletion sample_startup.bash
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ config="$1"

# check if video file exists, if not download it
line=$(cat $config | grep VideoPath)
videoPath="$(cut -d'=' -f2 <<<"$line")"
videoPath="$(cut -d'=' -f2 <<<"$line" | xargs)"
if [ ! -f "$videoPath" ]; then
echo "video file at $videoPath not exists, downloading..."
sh '/repo/download_sample_video.sh'
Expand Down
6 changes: 0 additions & 6 deletions start_services.bash

This file was deleted.

42 changes: 42 additions & 0 deletions supervisord.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
; Supervisor configuration for the processor container.
; Every program receives the config file name through the CONFIG_FILE
; environment variable, expanded below as %(ENV_CONFIG_FILE)s.
[supervisord]

; Unix socket on which supervisord accepts control requests.
[unix_http_server]
file=/var/run/supervisor.sock
chmod=0700

[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface

; supervisorctl (used by healthcheck.bash) connects through the same socket.
[supervisorctl]
serverurl=unix:///var/run/supervisor.sock

; Processor API; output is forwarded to the container's stdout/stderr
; (maxbytes=0 disables log rotation, which /dev/stdout cannot support).
[program:api]
command=python3 run_processor_api.py --config %(ENV_CONFIG_FILE)s
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
autorestart=true

; Processor core.
[program:core]
command=python3 run_processor_core.py --config %(ENV_CONFIG_FILE)s
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
autorestart=true

; Startup script. It is not marked autorestart=true like the other programs
; and appears to be a one-shot task.
[program:startup]
command=/repo/sample_startup.bash %(ENV_CONFIG_FILE)s
; With the default startsecs=1, a run that finishes in under a second counts
; as a failed start even with exit code 0; supervisord retries it and finally
; marks it FATAL, which healthcheck.bash reports as unhealthy. startsecs=0
; tells supervisord the program does not need to stay running.
startsecs=0
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0

; Report generation.
[program:reports]
command=python3 create_reports.py --config %(ENV_CONFIG_FILE)s
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
autorestart=true
8 changes: 6 additions & 2 deletions x86-gpu.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3-pip \
python3-scipy \
python3-wget \
supervisor \
&& rm -rf /var/lib/apt/lists/* \
&& python3 -m pip install --upgrade pip setuptools==41.0.0 && pip install -r /requirements.txt \
&& apt-get purge -y \
Expand All @@ -80,7 +81,10 @@ ENV DEV_ALLOW_ALL_ORIGINS=true
ENV AWS_SHARED_CREDENTIALS_FILE=/repo/.aws/credentials
ENV AWS_CONFIG_FILE=/repo/.aws/config
ENV TF_FORCE_GPU_ALLOW_GROWTH=true
ENV CONFIG_FILE=config-x86-gpu.ini

COPY . /repo
WORKDIR /repo
ENTRYPOINT ["bash", "start_services.bash"]
CMD ["config-x86-gpu.ini"]

HEALTHCHECK --interval=30s --retries=2 --start-period=15s CMD bash healthcheck.bash
CMD supervisord -c supervisord.conf -n
6 changes: 5 additions & 1 deletion x86-openvino.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3-pip \
python3-scipy \
python3-wget \
supervisor \
&& rm -rf /var/lib/apt/lists/* \
&& python3 -m pip install --upgrade pip setuptools==41.0.0 wheel && pip install -r /requirements.txt \
&& apt-get purge -y \
Expand All @@ -82,7 +83,10 @@ ADD docker/x86-openvino/openvino_setupvars.py /opt/openvino_setupvars.py
ENV DEV_ALLOW_ALL_ORIGINS=true
ENV AWS_SHARED_CREDENTIALS_FILE=/repo/.aws/credentials
ENV AWS_CONFIG_FILE=/repo/.aws/config
ENV CONFIG_FILE=config-x86-openvino.ini

COPY . /repo
WORKDIR /repo
CMD env `python3 /opt/openvino_setupvars.py` bash start_services.bash config-x86-openvino.ini

HEALTHCHECK --interval=30s --retries=2 --start-period=15s CMD bash healthcheck.bash
CMD env `python3 /opt/openvino_setupvars.py` supervisord -c supervisord.conf -n
7 changes: 5 additions & 2 deletions x86.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3-pip \
python3-scipy \
python3-wget \
supervisor \
&& rm -rf /var/lib/apt/lists/* \
&& python3 -m pip install --upgrade pip setuptools==41.0.0 && pip install -r /requirements.txt \
&& apt-get purge -y \
Expand All @@ -79,8 +80,10 @@ RUN apt-get update && apt-get install -y python3-dev && pip3 install torch==1.5
ENV DEV_ALLOW_ALL_ORIGINS=true
ENV AWS_SHARED_CREDENTIALS_FILE=/repo/.aws/credentials
ENV AWS_CONFIG_FILE=/repo/.aws/config
ENV CONFIG_FILE=config-x86.ini

COPY . /repo
WORKDIR /repo
ENTRYPOINT ["bash", "start_services.bash"]
CMD ["config-x86.ini"]

HEALTHCHECK --interval=30s --retries=2 --start-period=15s CMD bash healthcheck.bash
CMD supervisord -c supervisord.conf -n

0 comments on commit bf82aad

Please sign in to comment.