forked from noi-techpark/bdp-elaborations
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch '4-draft-data-validation-as-dag' into 'main'
Draft data validation as DAG Closes noi-techpark#4 See merge request u-hopper/projects/industrial/open-data-hub-bz/bdp-elaborations!13
- Loading branch information
Showing
9 changed files
with
314 additions
and
134 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
# SPDX-FileCopyrightText: NOI Techpark <digital@noi.bz.it> | ||
# | ||
# SPDX-License-Identifier: AGPL-3.0-or-later | ||
from .common import VehicleClass, MeasureCollection, Measure, Provenance, DataType | ||
from .common import VehicleClass, MeasureCollection, Measure, Provenance, DataType, StationLatestMeasure, Station | ||
from .traffic import TrafficSensorStation, TrafficMeasure, TrafficMeasureCollection, TrafficEntry | ||
from .pollution import PollutionEntry, PollutantClass, PollutionMeasure, PollutionMeasureCollection |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
# SPDX-FileCopyrightText: NOI Techpark <digital@noi.bz.it> | ||
# | ||
# SPDX-License-Identifier: AGPL-3.0-or-later | ||
|
||
from __future__ import absolute_import, annotations | ||
|
||
import itertools | ||
import logging | ||
from datetime import datetime | ||
from typing import List | ||
|
||
from common.connector.collector import ConnectorCollector | ||
from common.data_model import TrafficMeasureCollection, TrafficSensorStation, StationLatestMeasure | ||
from common.settings import ODH_MINIMUM_STARTING_DATE | ||
|
||
logger = logging.getLogger("common.manager.traffic_station") | ||
|
||
|
||
class TrafficStationManager:
    """
    Retrieves traffic sensor stations and their measures from ODH through the
    injected ConnectorCollector, caching the station list after the first fetch.
    """

    def __init__(self, connector_collector: ConnectorCollector) -> None:
        self._connector_collector = connector_collector
        # Lazily populated cache of all known traffic stations
        # (see get_traffic_stations_from_cache).
        self._traffic_stations: List[TrafficSensorStation] = []

    def get_traffic_stations_from_cache(self) -> List[TrafficSensorStation]:
        """
        Return the cached station list, fetching it from ODH on first access.

        :return: List of all available traffic sensor stations.
        """
        if not self._traffic_stations:
            logger.info("Retrieving station list from ODH")
            self._traffic_stations = self._get_station_list()
        return self._traffic_stations

    def get_all_latest_measures(self) -> List[StationLatestMeasure]:
        """
        Return, for each station with at least one latest measure, a
        StationLatestMeasure holding the most recent valid_time among that
        station's measures (ODH_MINIMUM_STARTING_DATE if none has a valid_time).

        :return: List of stations with their latest measure date.
        """
        all_measures = self._connector_collector.traffic.get_latest_measures()

        # BUG FIX: the previous implementation grouped with itertools.groupby,
        # which only groups *consecutive* equal keys; on input not sorted by
        # station code it produced several partial groups per station, and the
        # dict insertion kept only the last fragment, yielding a wrong (too
        # early) latest date. Dict-based grouping is order-independent.
        grouped: dict = {}
        for measure in all_measures:
            grouped.setdefault(measure.station.code, []).append(measure)

        return [
            StationLatestMeasure(station_code,
                                 max((m.valid_time for m in measures),
                                     default=ODH_MINIMUM_STARTING_DATE))
            for station_code, measures in grouped.items()
        ]

    def _get_station_list(self) -> List[TrafficSensorStation]:
        """
        Retrieve the list of all the available stations from ODH.
        """
        return self._connector_collector.traffic.get_station_list()

    def _download_traffic_data(self,
                               from_date: datetime,
                               to_date: datetime,
                               traffic_station: TrafficSensorStation
                               ) -> TrafficMeasureCollection:
        """
        Download traffic data measures for the given station in the given interval.

        :param from_date: Traffic measures before this date are discarded if there isn't any latest pollution measure available.
        :param to_date: Traffic measure after this date are discarded.
        :param traffic_station: The station whose measures are downloaded.
        :return: The resulting TrafficMeasureCollection containing the traffic data.
        """
        return TrafficMeasureCollection(
            measures=self._connector_collector.traffic.get_measures(
                from_date=from_date, to_date=to_date, station=traffic_station))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
# SPDX-FileCopyrightText: NOI Techpark <digital@noi.bz.it> | ||
# | ||
# SPDX-License-Identifier: AGPL-3.0-or-later | ||
|
||
import logging | ||
from datetime import timedelta, datetime | ||
|
||
from airflow.decorators import task | ||
|
||
from common.manager.traffic_station import TrafficStationManager | ||
from dags.common import TrafficStationsDAG | ||
from common.connector.collector import ConnectorCollector | ||
from common.data_model import TrafficSensorStation, StationLatestMeasure | ||
from common.settings import ODH_MINIMUM_STARTING_DATE, DAG_VALIDATION_EXECUTION_CRONTAB | ||
|
||
# see https://airflow.apache.org/docs/apache-airflow/stable/authoring-and-scheduling/dynamic-task-mapping.html | ||
|
||
logger = logging.getLogger("dags.aiaas_validator") | ||
|
||
# Default arguments applied to every task of this DAG (standard Airflow keys).
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    # Start from the configured ODH minimum starting date (see common.settings).
    'start_date': ODH_MINIMUM_STARTING_DATE,
    # NOTE(review): placeholder address — email_on_failure/retry are disabled anyway.
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    # Retry each failed task once, 5 minutes after the failure.
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# Identifier of this DAG within the Airflow instance.
THIS_DAG_ID = "validator"
|
||
with TrafficStationsDAG(
    THIS_DAG_ID,

    # execution interval if no backfill, step length on date increment if backfill
    # (interval determined by first slot available in queue);
    # schedule_interval is deprecated, hence the `schedule` keyword
    schedule=DAG_VALIDATION_EXECUTION_CRONTAB,

    # execution date starting at (if needed, backfill)
    start_date=ODH_MINIMUM_STARTING_DATE,

    # if True, the scheduler creates a DAG Run for each completed interval between start_date and end_date
    # and the scheduler will execute them sequentially
    # no need to backfill with catch-up, we can rely on programmatically process-the-oldest-data-on-ODH
    catchup=False,

    tags=["aiaas", "validator"],

    # 1 as the maximum number of active DAG runs per DAG:
    # dag execution mode should be sequential to avoid periods overlapping and
    # to avoid quick and recent runs blocking slow and older ones (as ODH does not accept "older" data writing)
    max_active_runs=1,

    default_args=default_args
) as dag:

    def _init_manager() -> TrafficStationManager:
        """
        Build a TrafficStationManager wired to connectors configured from
        environment variables. Called inside each task so every mapped task
        instance gets its own manager.
        """
        connector_collector = ConnectorCollector.build_from_env()
        manager = TrafficStationManager(connector_collector)
        return manager

    @task
    def get_stations_list(**kwargs) -> list[dict]:
        """
        Return the complete list of stations, or a list filtered on the basis
        of the previous DAG run.

        :return: list of dicts, each the JSON representation of a station
        """
        manager = _init_manager()
        # Station retrieval/filtering is delegated to TrafficStationsDAG
        # (defined in dags.common, outside this file).
        station_dicts = dag.get_stations_list(manager, **kwargs)
        return station_dicts

    @task
    def process_station(station_dict: dict, **kwargs):
        """
        Validate the data of a single station (dynamically mapped: one task
        instance per station, see `process_station.expand` below).

        :param station_dict: JSON representation of the station to process
        """
        station = TrafficSensorStation.from_json(station_dict)
        logger.info(f"Received station {station}")

        manager = _init_manager()

        # No explicit bounds given: the DAG helper computes the date range
        # to validate (semantics defined in dags.common — TODO confirm).
        min_from_date, max_to_date = dag.init_date_range(None, None)

        computation_start_dt = datetime.now()

        logger.info(f"running validation from {min_from_date} to {max_to_date}")

        # TODO: implement validation

        computation_end_dt = datetime.now()
        logger.info(f"Completed validation in [{(computation_end_dt - computation_start_dt).seconds}]")

    @task
    def whats_next(already_processed_stations, **kwargs):
        """
        Checks if there are still data to be processed before ending DAG runs;
        delegates the decision and the (possible) re-trigger to
        TrafficStationsDAG.trigger_next_dag_run.

        :param already_processed_stations: the stations already processed
            (not used — only enforces task ordering after the mapped tasks)
        """
        manager = _init_manager()

        def has_remaining_data(measure: StationLatestMeasure) -> bool:
            # TODO: implement method to check if there are still data to be processed before ending DAG runs
            raise NotImplementedError

        dag.trigger_next_dag_run(manager, dag, has_remaining_data, **kwargs)

    # Fan out: one process_station task instance per station dict returned
    # by get_stations_list (Airflow dynamic task mapping).
    processed_stations = process_station.expand(station_dict=get_stations_list())

    whats_next(processed_stations)
Oops, something went wrong.