
Commit

Publish ingestion metrics on statsd (#7875)
* Add statsd service

* Add celery task to publish stats

* Add task to prod misc worker

* Fix beat task

* Aggregate stats

* Publish on statsd

* Fix date range filter

* Allow changing the statsd host and port

* Add a prefix for stats publication
vrigal authored Nov 21, 2023
1 parent c13a218 commit 307abe8
Showing 10 changed files with 180 additions and 25 deletions.
3 changes: 3 additions & 0 deletions .prettierignore
@@ -9,3 +9,6 @@ ui/partials/

# Ignore markdown
*.md

# Statsd configuration file
docker/statsd_config.js
11 changes: 10 additions & 1 deletion docker-compose.yml
@@ -148,7 +148,7 @@ services:
      - DATABASE_URL=mysql://root@mysql:3306/treeherder
      - PROJECTS_TO_INGEST=${PROJECTS_TO_INGEST:-autoland,try}
    entrypoint: './docker/entrypoint.sh'
    command: celery -A treeherder worker --uid=nobody --gid=nogroup --without-gossip --without-mingle --without-heartbeat -Q store_pulse_pushes,store_pulse_tasks,store_pulse_tasks_classification --concurrency=1 --loglevel=INFO
    command: celery -A treeherder worker --uid=nobody --gid=nogroup --without-gossip --without-mingle --without-heartbeat -Q store_pulse_pushes,store_pulse_tasks,store_pulse_tasks_classification,statsd --concurrency=1 --loglevel=INFO
    volumes:
      - .:/app
    depends_on:
@@ -157,6 +157,15 @@
      - redis
      - rabbitmq
    platform: linux/amd64

  statsd:
    container_name: statsd
    image: statsd/statsd:v0.10.2
    volumes:
      - ./docker/statsd_config.js:/usr/src/app/config.js
    ports:
      - '8125:8125'

volumes:
  # TODO: Experiment with using tmpfs when testing, to speed up database-using Python tests.
  mysql_data: {}
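As an aside (not part of this commit), a hypothetical smoke test for the new service: from a container on the same Compose network, statsd accepts plain-text metrics over UDP on port 8125, so a throwaway counter can be sent like this:

import socket

# Hypothetical check only: 'statsd' is the service hostname on the Compose
# network defined above; 'treeherder.smoke_test:1|c' increments a counter.
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
sock.sendto(b'treeherder.smoke_test:1|c', ('statsd', 8125))
sock.close()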
2 changes: 1 addition & 1 deletion docker/entrypoint_prod.sh
@@ -56,7 +56,7 @@ elif [ "$1" == "worker_log_parser_fail_json_unsheriffed" ]; then
# Tasks that don't need a dedicated worker.
elif [ "$1" == "worker_misc" ]; then
    export REMAP_SIGTERM=SIGQUIT
    exec newrelic-admin run-program celery -A treeherder worker --without-gossip --without-mingle --without-heartbeat -Q default,generate_perf_alerts,pushlog --concurrency=3
    exec newrelic-admin run-program celery -A treeherder worker --without-gossip --without-mingle --without-heartbeat -Q default,generate_perf_alerts,pushlog,statsd --concurrency=3

# Cron jobs
elif [ "$1" == "run_intermittents_commenter" ]; then
8 changes: 8 additions & 0 deletions docker/statsd_config.js
@@ -0,0 +1,8 @@
{
  debug: true,
  healthStatus: true,
  dumpMessages: true,
  port: 8125,
  // InfluxDB backend should be added to read stats
  backends: []
}
3 changes: 3 additions & 0 deletions requirements/common.in
@@ -44,3 +44,6 @@ moz-measure-noise==2.60.1

# Used in the intermittents commenter
jinja2==3.1.2

# Client to publish runtime statistics to statsd
statsd==4.0.1
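As an aside (not part of this commit), the statsd package pinned above is a small pure-Python client; a minimal sketch of how such a client is used, with an illustrative host and metric name, looks like this:

import statsd

# Illustrative only: in this commit the real client is built from Django
# settings in treeherder/workers/stats.py (get_stats_client).
client = statsd.StatsClient('localhost', 8125, prefix='treeherder')
client.incr('push', 10)  # sends 'treeherder.push:10|c' over UDP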
30 changes: 7 additions & 23 deletions requirements/common.txt
@@ -1,6 +1,6 @@
#
# This file is autogenerated by pip-compile with python 3.9
# To update, run:
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile --generate-hashes --output-file=requirements/common.txt requirements/common.in
#
@@ -101,10 +101,6 @@ amqp==5.1.1 \
--hash=sha256:2c1b13fecc0893e946c65cbd5f36427861cffa4ea2201d8f6fca22e2a373b5e2 \
--hash=sha256:6f0956d2c23d8fa6e7691934d8c3930eadb44972cbbd1a7ae3a520f735d43359
# via kombu
ansicon==1.89.0 \
--hash=sha256:e4d039def5768a47e4afec8e89e83ec3ae5a26bf00ad851f914d1240b444d2b1 \
--hash=sha256:f1def52d17f65c2c9682cf8370c03f541f410c1752d6a14029f97318e4b9dfec
# via jinxed
appdirs==1.4.4 \
--hash=sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41 \
--hash=sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128
@@ -353,12 +349,6 @@ click-repl==0.3.0 \
--hash=sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9 \
--hash=sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812
# via celery
colorama==0.4.6 \
--hash=sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44 \
--hash=sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6
# via
# click
# loguru
crashtest==0.4.1 \
--hash=sha256:80d7b1f316ebfbd429f648076d6275c877ba30ba48979de4191714a75266f0ce \
--hash=sha256:8d23eac5fa660409f57472e3851dab7ac18aba459a8d19cbbba86d3d5aecd2a5
@@ -495,10 +485,6 @@ jinja2==3.1.2 \
--hash=sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852 \
--hash=sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61
# via -r requirements/common.in
jinxed==1.2.0 \
--hash=sha256:032acda92d5c57cd216033cbbd53de731e6ed50deb63eb4781336ca55f72cda5 \
--hash=sha256:cfc2b2e4e3b4326954d546ba6d6b9a7a796ddcb0aef8d03161d005177eb0d48b
# via blessed
jmespath==1.0.1 \
--hash=sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980 \
--hash=sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe
@@ -1281,6 +1267,10 @@ sqlparse==0.4.4 \
--hash=sha256:5430a4fe2ac7d0f93e66f1efc6e1338a41884b7ddf2a350cedd20ccc4d9d28f3 \
--hash=sha256:d446183e84b8349fa3061f0fe7f06ca94ba65b426946ffebe6e3e8295332420c
# via django
statsd==4.0.1 \
--hash=sha256:99763da81bfea8daf6b3d22d11aaccb01a8d0f52ea521daab37e758a4ca7d128 \
--hash=sha256:c2676519927f7afade3723aca9ca8ea986ef5b059556a980a867721ca69df093
# via -r requirements/common.in
tabulate==0.9.0 \
--hash=sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c \
--hash=sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f
@@ -1311,9 +1301,7 @@ typing-extensions==4.7.1 \
tzdata==2023.3 \
--hash=sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a \
--hash=sha256:7e65763eef3120314099b6939b5546db7adce1e7d6f2e179e3df563c70511eda
# via
# celery
# django
# via celery
uritemplate==4.1.1 \
--hash=sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0 \
--hash=sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e
@@ -1387,10 +1375,6 @@ whitenoise[brotli]==6.5.0 \
--hash=sha256:15fe60546ac975b58e357ccaeb165a4ca2d0ab697e48450b8f0307ca368195a8 \
--hash=sha256:16468e9ad2189f09f4a8c635a9031cc9bb2cdbc8e5e53365407acf99f7ade9ec
# via -r requirements/common.in
win32-setctime==1.1.0 \
--hash=sha256:15cf5750465118d6929ae4de4eb46e8edae9a5634350c01ba582df868e932cb2 \
--hash=sha256:231db239e959c2fe7eb1d7dc129f11172354f98361c4fa2d6d2d7e278baa8aad
# via loguru
yarl==1.9.2 \
--hash=sha256:04ab9d4b9f587c06d801c2abfe9317b77cdf996c65a90d5e84ecc45010823571 \
--hash=sha256:066c163aec9d3d073dc9ffe5dd3ad05069bcb03fcaab8d221290ba99f9f69ee3 \
52 changes: 52 additions & 0 deletions tests/test_worker/test_stats.py
@@ -0,0 +1,52 @@
import pytest
from unittest.mock import patch, MagicMock, call
from treeherder.workers.stats import publish_stats
from treeherder.model.models import Push, Job
from django.utils import timezone
from datetime import timedelta


@pytest.mark.django_db
@patch('treeherder.workers.stats.get_stats_client')
def test_publish_stats_nothing_to_do(get_worker_mock, django_assert_num_queries, caplog):
    statsd_client = MagicMock()
    get_worker_mock.return_value = statsd_client
    assert Push.objects.count() == 0
    assert Job.objects.count() == 0
    with django_assert_num_queries(2):
        publish_stats()
    assert [(level, message) for _, level, message in caplog.record_tuples] == [
        (20, 'Publishing runtime statistics to statsd'),
        (20, 'Ingested 0 pushes'),
        (20, 'Ingested 0 jobs in total'),
    ]
    assert statsd_client.call_args_list == []


@pytest.mark.django_db
@patch('treeherder.workers.stats.get_stats_client')
def test_publish_stats(
    get_worker_mock, eleven_jobs_stored_new_date, django_assert_num_queries, caplog, settings
):
    "Test statsd statistics publication task"
    settings.CELERY_STATS_PUBLICATION_DELAY = 10
    statsd_client = MagicMock()
    get_worker_mock.return_value = statsd_client
    assert Push.objects.count() == 10
    assert Job.objects.count() == 11
    Push.objects.update(time=timezone.now() - timedelta(minutes=10))
    Job.objects.update(end_time=timezone.now() - timedelta(minutes=10))

    with django_assert_num_queries(2):
        publish_stats()
    assert [(level, message) for _, level, message in caplog.record_tuples] == [
        (20, 'Publishing runtime statistics to statsd'),
        (20, 'Ingested 10 pushes'),
        (20, 'Ingested 11 jobs in total'),
    ]
    assert statsd_client.incr.call_args_list == [
        call('push', 10),
        call('jobs', 11),
        call('jobs_repo.mozilla-central', 11),
        call('jobs_state.completed', 11),
    ]
1 change: 1 addition & 0 deletions treeherder/celery.py
@@ -15,3 +15,4 @@

# Load task modules from all registered Django app configs.
app.autodiscover_tasks()
app.autodiscover_tasks(['treeherder.workers.stats'])
19 changes: 19 additions & 0 deletions treeherder/config/settings.py
@@ -10,6 +10,7 @@

from treeherder.config.utils import connection_should_use_tls
from treeherder.middleware import add_headers_function
from celery.schedules import crontab

# TODO: Switch to pathlib once using Python 3.
SRC_DIR = dirname(dirname(dirname(abspath(__file__))))
@@ -342,6 +343,7 @@
        routing_key='store_pulse_tasks_classification',
    ),
    Queue('store_pulse_pushes', Exchange('default'), routing_key='store_pulse_pushes'),
    Queue('statsd', Exchange('default'), routing_key='statsd'),
]

# Force all queues to be explicitly listed in `CELERY_TASK_QUEUES` to help prevent typos
@@ -378,6 +380,12 @@
CELERY_TASK_SOFT_TIME_LIMIT = 15 * 60
CELERY_TASK_TIME_LIMIT = CELERY_TASK_SOFT_TIME_LIMIT + 30

# Interval, in minutes, at which runtime statistics are published to statsd
CELERY_STATS_PUBLICATION_DELAY = 5
assert (
    0 < CELERY_STATS_PUBLICATION_DELAY < 60 and 60 % CELERY_STATS_PUBLICATION_DELAY == 0
), "CELERY_STATS_PUBLICATION_DELAY must be a divisor of 60 to form a valid cron schedule in minutes"

CELERY_BEAT_SCHEDULE = {
    # this is just a failsafe in case the Pulse ingestion misses something
    'fetch-push-logs-every-5-minutes': {
@@ -386,6 +394,12 @@
        'relative': True,
        'options': {"queue": "pushlog"},
    },
    'publish_stats': {
        'task': 'publish-stats',
        'schedule': crontab(minute=f'*/{CELERY_STATS_PUBLICATION_DELAY}'),
        'relative': True,
        'options': {'queue': 'statsd'},
    },
}

# CORS Headers
@@ -490,5 +504,10 @@
# https://github.com/settings/tokens
GITHUB_TOKEN = env("GITHUB_TOKEN", default=None)

# Statsd server configuration
STATSD_HOST = env('STATSD_HOST', default='statsd')
STATSD_PORT = env.int('STATSD_PORT', default=8125)
STATSD_PREFIX = env('STATSD_PREFIX', default='treeherder')

# For dockerflow
BASE_DIR = SRC_DIR
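For context (not part of this commit), crontab(minute='*/5'), which the schedule above produces for the default CELERY_STATS_PUBLICATION_DELAY of 5, fires at minutes 0, 5, 10, ... of every hour; this is why the setting must divide 60, and why publish_stats can round its window to the same boundaries without overlap. A small sketch of the equivalence:

from celery.schedules import crontab

CELERY_STATS_PUBLICATION_DELAY = 5  # default value from the settings above

# The beat entry's schedule expands to a wall-clock aligned cron rule,
# firing at minutes 0, 5, 10, ... of every hour.
schedule = crontab(minute=f'*/{CELERY_STATS_PUBLICATION_DELAY}')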
76 changes: 76 additions & 0 deletions treeherder/workers/stats.py
@@ -0,0 +1,76 @@
from celery import shared_task
from django.conf import settings
from django.utils import timezone
from datetime import timedelta
from django.db.models import Count
from treeherder.model.models import Push, Job
from itertools import groupby
import statsd
import logging

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def get_stats_client():
    return statsd.StatsClient(
        settings.STATSD_HOST, settings.STATSD_PORT, prefix=settings.STATSD_PREFIX
    )


@shared_task(name='publish-stats')
def publish_stats():
    """
    Publish runtime stats on statsd
    """
    stats_client = get_stats_client()
    logger.info('Publishing runtime statistics to statsd')
    end_date = timezone.now()
    # Round the date down to the current publication window.
    # Windows should not overlap, as the beat is a relative cron-based delay in minutes.
    end_date = end_date - timedelta(
        minutes=end_date.minute % settings.CELERY_STATS_PUBLICATION_DELAY,
        seconds=end_date.second,
        microseconds=end_date.microsecond,
    )

    start_date = end_date - timedelta(minutes=settings.CELERY_STATS_PUBLICATION_DELAY)
    logger.debug(f'Reading data ingested from {start_date} to {end_date}')

    # Number of pushes
    pushes_count = Push.objects.filter(time__lte=end_date, time__gt=start_date).count()
    logger.info(f'Ingested {pushes_count} pushes')
    if pushes_count:
        stats_client.incr('push', pushes_count)

    # Compute stats for jobs in a single request
    jobs_stats = (
        Job.objects.filter(end_time__lte=end_date, end_time__gt=start_date)
        .values('push__repository__name', 'state')
        .annotate(count=Count('id'))
        .values_list('push__repository__name', 'state', 'count')
    )
    # Total number of jobs
    jobs_total = sum(ct for _, _, ct in jobs_stats)
    logger.info(f'Ingested {jobs_total} jobs in total')
    if jobs_total:
        stats_client.incr('jobs', jobs_total)

    # Number of jobs per repository
    jobs_per_repo = {
        key: sum(ct for _, ct in vals)
        for key, vals in groupby(sorted((repo, ct) for repo, _, ct in jobs_stats), lambda x: x[0])
    }
    logger.debug(f'Jobs per repo: {jobs_per_repo}')
    for key, value in jobs_per_repo.items():
        stats_client.incr(f'jobs_repo.{key}', value)

    # Number of jobs per state
    jobs_per_state = {
        key: sum(ct for _, ct in vals)
        for key, vals in groupby(sorted((state, ct) for _, state, ct in jobs_stats), lambda x: x[0])
    }
    logger.debug(f'Jobs per state: {jobs_per_state}')
    for key, value in jobs_per_state.items():
        stats_client.incr(f'jobs_state.{key}', value)
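As a usage note (not part of this commit), the task can also be triggered by hand, e.g. from a Django shell; publish_stats is the task defined above, apply_async and its queue argument are standard Celery API, and the queue name matches the one added to CELERY_TASK_QUEUES:

from treeherder.workers.stats import publish_stats

# Run synchronously in the current process, publishing the last window:
publish_stats()

# Or enqueue it on the dedicated queue consumed by the misc worker:
publish_stats.apply_async(queue='statsd')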
