Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates health check endpoint to include triggerer status #27755

Merged
merged 7 commits into from
Dec 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions airflow/api_connexion/endpoints/health_endpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from airflow.api_connexion.schemas.health_schema import health_schema
from airflow.api_connexion.types import APIResponse
from airflow.jobs.scheduler_job import SchedulerJob
from airflow.jobs.triggerer_job import TriggererJob

HEALTHY = "healthy"
UNHEALTHY = "unhealthy"
Expand All @@ -28,7 +29,9 @@ def get_health() -> APIResponse:
"""Return the health of the airflow scheduler, metadatabase and triggerer."""
metadatabase_status = HEALTHY
latest_scheduler_heartbeat = None
latest_triggerer_heartbeat = None
scheduler_status = UNHEALTHY
triggerer_status: str | None = UNHEALTHY
try:
scheduler_job = SchedulerJob.most_recent_job()

Expand All @@ -38,13 +41,28 @@ def get_health() -> APIResponse:
scheduler_status = HEALTHY
except Exception:
metadatabase_status = UNHEALTHY
try:
triggerer_job = TriggererJob.most_recent_job()

if triggerer_job:
latest_triggerer_heartbeat = triggerer_job.latest_heartbeat.isoformat()
if triggerer_job.is_alive():
triggerer_status = HEALTHY
else:
triggerer_status = None
except Exception:
metadatabase_status = UNHEALTHY

payload = {
"metadatabase": {"status": metadatabase_status},
"scheduler": {
"status": scheduler_status,
"latest_scheduler_heartbeat": latest_scheduler_heartbeat,
},
"triggerer": {
"status": triggerer_status,
"latest_triggerer_heartbeat": latest_triggerer_heartbeat,
},
}

return health_schema.dump(payload)
9 changes: 8 additions & 1 deletion airflow/api_connexion/schemas/health_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,23 @@ class MetaDatabaseInfoSchema(BaseInfoSchema):


class SchedulerInfoSchema(BaseInfoSchema):
    """Schema for Scheduler info.

    Extends ``BaseInfoSchema`` with the scheduler's latest heartbeat.
    """

    # Output-only field (``dump_only``): it is serialized on dump and never
    # accepted as input.
    latest_scheduler_heartbeat = fields.String(dump_only=True)


class TriggererInfoSchema(BaseInfoSchema):
    """Health details reported for the triggerer component.

    Extends ``BaseInfoSchema`` with the triggerer's latest heartbeat.
    """

    # Output-only field (``dump_only``): it is serialized on dump and never
    # accepted as input.
    latest_triggerer_heartbeat = fields.String(dump_only=True)


class HealthInfoSchema(Schema):
    """Top-level schema for the Health endpoint response.

    Each attribute nests the schema of one reported component.
    """

    # One nested section per component, in the order they are reported.
    metadatabase = fields.Nested(MetaDatabaseInfoSchema)
    scheduler = fields.Nested(SchedulerInfoSchema)
    triggerer = fields.Nested(TriggererInfoSchema)


# Module-level schema instance; the health endpoint imports it to dump its payload.
health_schema = HealthInfoSchema()
8 changes: 8 additions & 0 deletions docs/apache-airflow/logging-monitoring/check-health.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ To check the health status of your Airflow instance, you can simply access the e
"scheduler":{
"status":"healthy",
"latest_scheduler_heartbeat":"2018-12-26 17:15:11+00:00"
},
"triggerer":{
"status":"healthy",
"latest_triggerer_heartbeat":"2018-12-26 17:16:12+00:00"
}
}

Expand All @@ -63,6 +67,10 @@ To check the health status of your Airflow instance, you can simply access the e
* If you run more than one scheduler, only the state of one scheduler will be reported, i.e. only one working scheduler is enough
for the scheduler state to be considered healthy

* The status of the ``triggerer`` behaves exactly like that of the ``scheduler`` as described above.
Note that the ``status`` and ``latest_triggerer_heartbeat`` fields in the health check response will be null for
deployments that do not include a ``triggerer`` component.

Please keep in mind that the HTTP response code of the ``/health`` endpoint **should not** be used to determine the health
status of the application. The return code is only indicative of the state of the REST call (200 for success).

Expand Down