From c225adb02cb6358bcce8c65c89595cc9ac0700cd Mon Sep 17 00:00:00 2001 From: kaburia Date: Wed, 11 Oct 2023 11:52:17 +0300 Subject: [PATCH] v0.5.1 --- README.md | 1863 +---------------------------------- filter-stations/__init__.py | 384 +++++++- setup.py | 2 +- water_level_pipeline.md | 1851 ++++++++++++++++++++++++++++++++++ 4 files changed, 2199 insertions(+), 1901 deletions(-) create mode 100644 water_level_pipeline.md diff --git a/README.md b/README.md index 56db659..66708dc 100644 --- a/README.md +++ b/README.md @@ -1,1858 +1,15 @@ + ## Documentation +You can find the documentation for the project by following this link
https://filter-stations.netlify.app/ -## Installation -``` -pip install filter-stations -``` - -## Water Level Pipeline -- A series of functions to be added to the filter-stations module in pypi to evalute which TAHMO stations to use that corroborates with the water level -- All begins with the coordinates of the gauging station(location of the monitoring sensor) - - -```python -import os -from pathlib import Path -import haversine as hs -import pandas as pd -import numpy as np -import datetime -import statsmodels.api as sm -from matplotlib.dates import DateFormatter -import matplotlib.pyplot as plt -import warnings -import dateutil.parser -warnings.filterwarnings('ignore') - -# config_path -config_path = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'config.json') -``` - - -```python -from filter_stations import retreive_data, Interactive_maps, Filter, pipeline -import json -# Authentication -with open(config_path) as f: - conf = json.load(f) - -apiKey = conf['apiKey'] -apiSecret = conf['apiSecret'] -map_api_key = conf['map_api_key'] -fs = retreive_data(apiKey, apiSecret, map_api_key) -pipe = pipeline(apiKey, apiSecret, map_api_key) -maps = Interactive_maps(apiKey, apiSecret, map_api_key) -``` - -### Loading data -Load the water level data from the github repository[Link here]
Load the TAHMO station data from [Link here]
- - -```python -# muringato -muringato_loc = [-0.406689, 36.96301] -# ewaso -ewaso_loc = [0.026833, 36.914637] - -# Weather stations data -weather_stations_data = pd.read_csv(os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'stations_precipitation.csv')) - -''' The water level data ''' -# muringato data sensor 2 2021 -muringato_data_s2_2021 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water_data_2021', 'muringato-sensor2.csv') - -# muringato data sensor 2 2022 -muringato_data_s2_2022 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water_data_2021', 'muringato-sensor2-2022.csv') - -# muringato data sensor 6 2021 -muringato_data_s6_2021 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water_data_2021', 'muringato-sensor6.csv') - -# muringato data sensor 6 2022 -muringato_data_s6_2022 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water_data_2021', 'muringato-sensor6-2022.csv') - - -# ewaso data sensor 2020 convert the time column to datetime -ewaso_data_2020 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water-level-data-ewaso', '1E2020.csv') - -# ewaso data sensor 2022 -ewaso_data_2022 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water-level-data-ewaso', '1E2022.csv') - -weather_stations_data.Date = weather_stations_data.Date.astype('datetime64[ns]') -weather_stations_data.set_index('Date', inplace=True) - -``` - -To format water level it needs to have a time column and water level column the names can be different but the order must be that - - -```python -# handle the water level data -def format_water_level(water_level_data_path): - # data needs to be in the format time, data/water_level or whatever the column is called - water_level_data = pd.read_csv(water_level_data_path) - # rename the first column to time - water_level_data.rename(columns={water_level_data.columns[0]: 'time'}, inplace=True) - # convert the time column to datetime - water_level_data.time = pd.to_datetime([dateutil.parser.parse(i).strftime('%d-%m-%Y') for i in water_level_data['time']]) - water_level_data.time = water_level_data.time.astype('datetime64[ns]') - # rename the column to water_level - water_level_data.rename(columns={water_level_data.columns[1]: 'water_level'}, inplace=True) - # set the time column as the index - water_level_data.set_index('time', inplace=True) - return water_level_data -``` - - -```python -muringato_data_s2_2021 = format_water_level(muringato_data_s2_2021) -muringato_data_s2_2022 = format_water_level(muringato_data_s2_2022) -muringato_data_s6_2021 = format_water_level(muringato_data_s6_2021) -muringato_data_s6_2022 = format_water_level(muringato_data_s6_2022) -ewaso_data_2020 = format_water_level(ewaso_data_2020) -ewaso_data_2022 = format_water_level(ewaso_data_2022) - -``` - -1. Filter the date range based on the water level data from first day of the water level data to the last day of the water level data -2. Choose stations within a certain radius of the gauging station 100 km for example get the resulting weather data -3. Get the stations with only 100 percent data no missing data -4. Remove the stations data with the value zero from beginning to end if the water level data has some values above zero -5. Calculate the correlation between the water level data and the weather data needs to be above 0 and have a lag of maximum 3 days -6. 
Plot the resulting figures - - -### Choosing ewaso 2020 range -removing stations with missing data reduces from 1035 to 849 columns
-removing all zeros reduces from 849 to 604 columns
-columns with positive correlation reduces the number from 604 columns to 283 columns
-checking for lag reduces the columns to 80 - - -```python -above, below = pipe.shed_stations(weather_stations_data, - muringato_data_s6_2022, - muringato_loc, - 100, - lag=3 - ) - -``` - - -```python -below_stations = [i.split('_')[0] for i in below.keys()] -print(below_stations) -below_stations_metadata = fs.get_stations_info(multipleStations=below_stations)[['code', 'location.latitude', 'location.longitude']] -``` - - ['TA00001', 'TA00023', 'TA00024', 'TA00025', 'TA00054', 'TA00056', 'TA00067', 'TA00077', 'TA00129', 'TA00147', 'TA00154', 'TA00155', 'TA00156', 'TA00166', 'TA00171', 'TA00189', 'TA00215', 'TA00222', 'TA00228', 'TA00230', 'TA00233', 'TA00250', 'TA00270', 'TA00270', 'TA00272', 'TA00272', 'TA00316', 'TA00317', 'TA00355', 'TA00459', 'TA00473', 'TA00480', 'TA00493', 'TA00494', 'TA00577', 'TA00601', 'TA00621', 'TA00653', 'TA00672', 'TA00676', 'TA00679', 'TA00692', 'TA00699', 'TA00704', 'TA00705', 'TA00711', 'TA00712', 'TA00712', 'TA00715', 'TA00717', 'TA00750', 'TA00751', 'TA00767'] - - - -```python -below_stations_metadata['distance']= below_stations_metadata.apply(lambda row: hs.haversine((muringato_loc[0], - muringato_loc[1]), (row['location.latitude'], - row['location.longitude'])), axis=1) -below_stations_metadata.sort_values(by='distance') -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|     | code    | location.latitude | location.longitude | distance (km) |
|-----|---------|-------------------|--------------------|---------------|
| 52  | TA00056 | -0.721656 | 37.145585 | 40.480889 |
| 22  | TA00024 | -1.071731 | 37.045578 | 74.517013 |
| 150 | TA00166 | -0.319508 | 37.659139 | 78.009238 |
| 172 | TA00189 | -0.795260 | 37.665930 | 89.304790 |
| 230 | TA00250 | -0.778940 | 37.676738 | 89.504935 |
| 600 | TA00715 | -1.225618 | 36.809065 | 92.655456 |
| 565 | TA00679 | -1.270835 | 36.723916 | 99.698089 |
| 23  | TA00025 | -1.301839 | 36.760200 | 102.058383 |
| 422 | TA00473 | -0.512371 | 35.956813 | 112.495996 |
| 513 | TA00621 | -1.633020 | 37.146185 | 137.874253 |
| 51  | TA00054 | -0.239342 | 35.728897 | 138.480985 |
| 424 | TA00480 | -1.376152 | 37.797646 | 142.238019 |
| 61  | TA00067 | -1.794285 | 37.621211 | 170.765765 |
| 140 | TA00156 | -1.701123 | 38.068339 | 189.255406 |
| 71  | TA00077 | -0.383066 | 35.068406 | 210.682047 |
| 139 | TA00155 | -2.523037 | 36.829437 | 235.795373 |
| 21  | TA00023 | -2.388550 | 38.040767 | 250.831198 |
| 155 | TA00171 | -0.002710 | 34.596908 | 266.903936 |
| 291 | TA00317 | 0.040440 | 34.371716 | 292.394991 |
| 0   | TA00001 | -1.123283 | 34.397992 | 296.112467 |
| 652 | TA00767 | -2.671990 | 38.369665 | 296.467402 |
| 290 | TA00316 | 0.289862 | 34.371222 | 298.418648 |
| 131 | TA00147 | 0.449274 | 34.282303 | 312.905564 |
| 117 | TA00129 | -3.390926 | 37.717656 | 342.264311 |
| 138 | TA00154 | -4.231107 | 37.847804 | 436.466702 |
| 211 | TA00230 | 1.724690 | 33.622000 | 440.623881 |
| 329 | TA00355 | 3.498069 | 35.843897 | 451.651266 |
| 544 | TA00653 | 0.265062 | 32.627203 | 487.869319 |
| 196 | TA00215 | 0.052465 | 32.440690 | 505.441217 |
| 203 | TA00222 | 1.186240 | 32.020330 | 577.409865 |
| 584 | TA00699 | -0.707570 | 31.402138 | 619.216128 |
| 558 | TA00672 | -6.180302 | 37.146832 | 642.321296 |
| 597 | TA00712 | -6.676308 | 39.131552 | 737.484276 |
| 562 | TA00676 | -6.780374 | 38.973512 | 742.978650 |
| 635 | TA00750 | -6.805316 | 39.139843 | 751.347364 |
| 636 | TA00751 | -6.848668 | 39.082174 | 753.892793 |
| 432 | TA00494 | -6.833860 | 39.167475 | 755.338586 |
| 248 | TA00270 | -6.842390 | 39.156760 | 755.852180 |
| 250 | TA00272 | -6.890039 | 39.117927 | 759.501414 |
| 431 | TA00493 | -6.910845 | 39.075597 | 760.236606 |
| 214 | TA00233 | 3.453500 | 31.251250 | 766.277105 |
| 209 | TA00228 | 3.404720 | 30.959600 | 790.422401 |
| 498 | TA00601 | -14.080148 | 33.907593 | 1557.147407 |
| 602 | TA00717 | 3.898305 | 11.886437 | 2827.236339 |
| 590 | TA00705 | 4.952251 | 8.341692 | 3234.191975 |
| 481 | TA00577 | 10.487147 | 9.788223 | 3240.086078 |
| 589 | TA00704 | 5.378602 | 6.998292 | 3388.907422 |
| 596 | TA00711 | 4.906530 | 6.917064 | 3389.011984 |
| 410 | TA00459 | 9.066148 | 6.569080 | 3526.820348 |
| 577 | TA00692 | 6.404114 | 5.626307 | 3559.025765 |
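As a quick sanity check on the table above, the distance column can be reproduced directly with the `haversine` package already imported in this pipeline; a minimal sketch for the nearest station in the table, TA00056 (coordinates taken from the table itself):

```python
import haversine as hs

# Muringato gauging station and the nearest TAHMO station (TA00056) from the table
muringato_loc = (-0.406689, 36.96301)
ta00056_loc = (-0.721656, 37.145585)

# hs.haversine returns the great-circle distance in kilometres by default
print(hs.haversine(muringato_loc, ta00056_loc))  # ~40.48 km, matching the distance column
```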
- - - - -```python -# Interactive visuals -import plotly.express as px -import plotly.graph_objects as go - -fig = px.scatter_mapbox(below_stations_metadata, - lat="location.latitude", - lon="location.longitude", - hover_name="code", - hover_data=["distance"], - color_discrete_sequence=["fuchsia"], - zoom=8, - height=800, - ) -# update marker size -fig.update_traces(marker=dict(size=10)) -# add a point for the central station -fig.add_trace(go.Scattermapbox( - lat=[muringato_loc[0]], - lon=[muringato_loc[1]], - mode='markers', - marker=go.scattermapbox.Marker( - size=14 - ), - text=['Muringato gauging station'], - )) - -fig.update_layout( - mapbox_style="carto-positron", - margin={"r":0,"t":0,"l":0,"b":0}, - showlegend=False -) -fig.show() -``` - - - - -```python -pipe.plot_figs( - weather_stations_data, - list(muringato_data_s6_2022['water_level']), - list(below.keys()), - date=dateutil.parser.parse(str(muringato_data_s6_2022.index[0])).strftime('%d-%m-%Y'), - save=False -) -``` - - Begin plotting! - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_1.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_2.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_3.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_4.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_5.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_6.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_7.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_8.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_9.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_10.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_11.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_12.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_13.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_14.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_15.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_16.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_17.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_18.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_19.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_20.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_21.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_22.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_23.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_24.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_25.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_26.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_27.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_28.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_29.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_30.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_31.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_32.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_33.png) - - - - - 
-![png](water_level_pipeline_files/water_level_pipeline_15_34.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_35.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_36.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_37.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_38.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_39.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_40.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_41.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_42.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_43.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_44.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_45.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_46.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_47.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_48.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_49.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_50.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_51.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_52.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_15_53.png) - - - - -```python -RADIUS = 100 - -ewaso_weather_data_2020 = weather_stations_data.loc[ewaso_data_2020.index[0]:ewaso_data_2020.index[-1]] -# ewaso stations within a particular radius -ewaso_tahmo_stations_2020 = pipe.stations_within_radius(RADIUS, ewaso_loc[0], ewaso_loc[1], df=False) -# Get stations without missing data -# ewaso weather data -ewaso_weather_data_2020_filtered = pipe.stations_data_check(stations_list=list(ewaso_tahmo_stations_2020), - percentage=1, data=ewaso_weather_data_2020 - ) -# Check the sum of each column and drop columns with a sum of zero this is if the sum of water level is not equal to zero -ewaso_weather_data_2020_filtered = ewaso_weather_data_2020_filtered.loc[:, ewaso_weather_data_2020_filtered.sum() != 0] -``` - - API request: services/assets/v2/stations - - - -```python -import statsmodels.api as sm -def calculate_lag(weather_stations_data, water_level_data, lag=3, above=None, below=None): - above_threshold_lag = dict() - below_threshold_lag = dict() - for cols in weather_stations_data.columns: - # check for positive correlation if not skip the column - if weather_stations_data[cols].corr(water_level_data['water_level']) <= 0: - continue - # get the lag and the coefficient for columns with a positive correlation - coefficient_list = list(sm.tsa.stattools.ccf(weather_stations_data[cols], water_level_data['water_level'])) - a = np.argmax(coefficient_list) - b = coefficient_list[a] - # print(f'{cols} has a lag of {a}') - # print(f'{cols} has a coefficient of {b}') - # print('-----------------------') - if a > lag: - above_threshold_lag[cols] = a - elif a <= lag: - below_threshold_lag[cols] = a - if above: - return above_threshold_lag - elif below: - return below_threshold_lag - else: - return above_threshold_lag, below_threshold_lag - - -``` - -Bringing all the functions together to create a pipeline - - -```python -def shed_stations(weather_stations_data, water_level_data, - gauging_station_coords, radius, lag=3, - percentage=1, above=None, below=None): - # 
Filter the date range based on the water level data from first day of the water level data to the last day of the water level data - weather_stations_data = weather_stations_data.loc[water_level_data.index[0]:water_level_data.index[-1]] - # Filter the weather stations based on the radius - lat, lon = gauging_station_coords[0], gauging_station_coords[1] - weather_stations_data_list = pipe.stations_within_radius(radius, lat, lon, df=False) - # get stations without missing data or the percentage of stations with missing data - weather_stations_data_filtered = pipe.stations_data_check(stations_list=weather_stations_data_list, - percentage=percentage, - data=weather_stations_data) - # Check the sum of each column and drop columns with a sum of zero this is if the sum of water level is not equal to zero - weather_stations_data_filtered = weather_stations_data_filtered.loc[:, weather_stations_data_filtered.sum() != 0] - - # Filter the weather stations based on the lag and positive correlation - above_threshold_lag, below_threshold_lag = calculate_lag(weather_stations_data_filtered, water_level_data, lag=lag) - - return above_threshold_lag, below_threshold_lag -``` - - -```python -above_threshold_lag, below_threshold_lag = shed_stations(weather_stations_data, ewaso_data_2020, ewaso_loc, RADIUS, lag=3, percentage=1, above=True, below=False) -len(below_threshold_lag) -``` - - API request: services/assets/v2/stations - - - - - - 80 - - - -### Plot the figures - - -```python -pipe.plot_figs( - weather_stations_data, - list(ewaso_data_2020['water_level']), - list(below_threshold_lag.keys()), - date=dateutil.parser.parse(str(ewaso_data_2020.index[0])).strftime('%d-%m-%Y'), - save=True -) -``` - - Begin plotting! - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_1.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_2.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_3.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_4.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_5.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_6.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_7.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_8.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_9.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_10.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_11.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_12.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_13.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_14.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_15.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_16.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_17.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_18.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_19.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_20.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_21.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_22.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_23.png) - - - - - 
-![png](water_level_pipeline_files/water_level_pipeline_22_24.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_25.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_26.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_27.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_28.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_29.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_30.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_31.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_32.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_33.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_34.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_35.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_36.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_37.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_38.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_39.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_40.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_41.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_42.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_43.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_44.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_45.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_46.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_47.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_48.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_49.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_50.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_51.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_52.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_53.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_54.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_55.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_56.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_57.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_58.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_59.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_60.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_61.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_62.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_63.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_64.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_65.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_66.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_67.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_68.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_69.png) - - - - - 
-![png](water_level_pipeline_files/water_level_pipeline_22_70.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_71.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_72.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_73.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_74.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_75.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_76.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_77.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_78.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_79.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_22_80.png) - - - -Input water level data
-Input TAHMO station data
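The plot below compares the station sets around the two gauges. `muringato_tahmo_stations` and `ewaso_tahmo_stations` are not defined earlier in this notebook, so here is a minimal sketch of how they could be built with `stations_within_radius`, assuming the same 100 km radius used elsewhere in the pipeline:

```python
RADIUS = 100  # km, as used earlier in the pipeline

# DataFrames with station codes and coordinates around each gauging station
muringato_tahmo_stations = pipe.stations_within_radius(RADIUS, muringato_loc[0], muringato_loc[1], df=True)
ewaso_tahmo_stations = pipe.stations_within_radius(RADIUS, ewaso_loc[0], ewaso_loc[1], df=True)
```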
- - - -```python -# plot the two with different colors -fig, ax = plt.subplots(figsize=(10, 10)) -muringato_tahmo_stations.plot(kind='scatter', - x='location.longitude', - y='location.latitude', - color='blue', - alpha=0.7, - ax=ax) -ewaso_tahmo_stations.plot(kind='scatter', - x='location.longitude', - y='location.latitude', - color='red', - alpha=0.7, - ax=ax) -plt.show() -``` - - - -![png](water_level_pipeline_files/water_level_pipeline_24_0.png) - - - -Apart from the completeness another method of validation by eliminating unusable sensors is checking for a positive correlation and lag -- The default lag is 3 days between a particular station and the gauging station -- The required format is a timeseries data -- Provide the column names for evaluation format = [Date, data] -- with the change in parameters one can choose above or below threshold - - -```python -def plot_figs(weather_stations, water_list, threshold_list, save=False, dpi=500, date='11-02-2021'): - start_date = datetime.datetime.strptime(date, "%d-%m-%Y") - end_date = start_date + datetime.timedelta(len(water_list)-1) - # weather_stations = weather_stations.set_index('Date') - df_plot = weather_stations[start_date:end_date] - df_plot = df_plot[threshold_list].reset_index() - df_plot.rename(columns={'index':'Date'}, inplace=True) - - - plt.rcParams['figure.figsize'] = (15, 9) - print('Begin plotting!') - - for cols in df_plot.columns[1:]: - fig, ax1 = plt.subplots() - color = 'tab:blue' - ax1.set_xlabel(f'Time', fontsize=24, weight='bold') - ax1.set_ylabel(f'Rainfall {cols} (mm)', color=color, fontsize=24, weight='bold') - ax1.bar(pd.to_datetime(df_plot['Date'], format="%d/%m/%Y"), df_plot[f'{cols}'], color=color, width=4, alpha=1.0) - ax1.tick_params(axis='y', labelcolor=color, labelsize=24) - ax1.tick_params(axis='x') - ax1.set_xticklabels(df_plot['Date'], fontsize=21, weight='bold') - ax1.grid(color='gray', linestyle='--', linewidth=0.8) - ax1.set(facecolor="white") - ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis - - color = 'tab:red' - ax2.set_ylabel('Water level/Stage (m)', color=color, fontsize=24, weight='bold') - ax2.plot(pd.to_datetime(df_plot['Date'], format="%d/%m/%Y"), water_list, color=color, linewidth=4) - ax2.tick_params(axis='y', labelcolor=color, labelsize=24) - ax2.set(facecolor="white") - plt.title('Stage and Rainfall against Time', fontsize=22, weight='bold') - - date_form = DateFormatter("%m-%y") - ax1.xaxis.set_major_formatter(date_form) - fig.tight_layout() - - if save: - fig.savefig(f'{cols}.png', dpi=dpi) - -``` - - -```python -plot_figs(stations_df, lag_[list(lag_.keys())[0]]['water_list'], list(lag_.keys()), save=True, date='12-05-2020') -``` - - Begin plotting! 
- - - - -![png](water_level_pipeline_files/water_level_pipeline_27_1.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_27_2.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_27_3.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_27_4.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_27_5.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_27_6.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_27_7.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_27_8.png) - - - - - -![png](water_level_pipeline_files/water_level_pipeline_27_9.png) - - - -Format to get the stations maetadata - - -```python -def filter_metadata(lag_keys): - captured_list = [i.split('_')[0] for i in list(lag_keys)] - return fs.get_stations_info(multipleStations=captured_list) -``` - - -```python -filter_metadata(list(lag_.keys())) -``` - - API request: services/assets/v2/stations - - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|     | code | status | installationdate | elevationground | sensorinstallations | dataloggerinstallations | creatorid | created | updaterid | updated | ... | location.countrycode | location.zipcode | location.latitude | location.longitude | location.elevationmsl | location.note | location.creatorid | location.created | location.updaterid | location.updated |
|-----|------|--------|------------------|-----------------|---------------------|-------------------------|-----------|---------|-----------|---------|-----|----------------------|------------------|-------------------|--------------------|-----------------------|---------------|--------------------|------------------|--------------------|------------------|
| 26  | TA00028 | 1 | 2015-08-31T00:00:00Z | 9.0 | None | None | 2 | 2018-12-11T08:35:17.888233Z | 2 | 2018-12-11T08:35:17.888233Z | ... | KE | | 0.055219 | 37.136747 | 2003.6 | {} | 2 | 2018-10-26T13:32:16.15537Z | 37 | 2022-06-30T11:11:50.27135Z |
| 27  | TA00029 | 1 | 2015-09-02T00:00:00Z | 2.0 | None | None | 2 | 2018-12-11T08:36:19.30342Z | 2 | 2018-12-11T08:36:19.30342Z | ... | KE | | -0.500776 | 36.587511 | 2545.8 | {} | 2 | 2018-10-26T13:33:31.451613Z | 37 | 2022-02-28T12:25:09.578242Z |
| 53  | TA00057 | 1 | 2015-10-08T00:00:00Z | 2.0 | None | None | 2 | 2018-12-11T09:21:29.092833Z | 2 | 2018-12-11T09:21:29.092833Z | ... | KE | | -1.253030 | 36.856487 | 1645.3 | {} | 2 | 2018-10-29T09:13:33.768613Z | 2 | 2022-07-26T07:34:06.603938Z |
| 68  | TA00074 | 1 | 2015-11-19T00:00:00Z | 2.0 | None | None | 2 | 2018-12-11T09:38:25.742397Z | 2 | 2018-12-11T09:38:25.742397Z | ... | KE | | -0.566080 | 37.074412 | 1726.8 | {} | 2 | 2018-10-29T10:35:28.49617Z | 2 | 2022-07-26T07:38:42.100985Z |
| 74  | TA00080 | 1 | 2016-01-28T00:00:00Z | 2.0 | None | None | 2 | 2018-12-11T09:43:10.523398Z | 2 | 2018-12-11T09:43:10.523398Z | ... | KE | | -1.087589 | 36.818402 | 1777.3 | {} | 2 | 2018-10-29T10:53:47.845042Z | 37 | 2022-02-28T13:07:04.709903Z |
| 150 | TA00166 | 1 | 2017-05-11T00:00:00Z | 2.0 | None | None | 2 | 2018-12-12T08:29:28.10697Z | 2 | 2018-12-12T08:29:28.10697Z | ... | KE | | -0.319508 | 37.659139 | 1404.0 | {} | 2 | 2018-11-10T08:47:37.949135Z | 2 | 2018-11-10T08:47:37.949135Z |

6 rows × 28 columns
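Only a few of the 28 metadata columns are needed downstream; a sketch of trimming the result to the identifiers and coordinates used elsewhere in this pipeline (assuming the `lag_` dictionary from the plotting step above):

```python
stations_metadata = filter_metadata(list(lag_.keys()))
# keep only the code and coordinate columns used for mapping and distance calculations
stations_metadata = stations_metadata[['code', 'location.latitude', 'location.longitude']]
```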

-
- - - -```python +Getting Started +--------------- +All methods require an API key and secret, which can be obtained by contacting TAHMO.
+- The ```retreive_data``` class is used to retrieve data from the TAHMO API endpoints.
+- The ```Filter``` class is used to filter weather stations data based on things like distance and region.
+- The ```pipeline``` class is used to create a pipeline of filters to apply to weather stations based on how they correlate with water level data.
+- The ```Interactive_maps``` class is used to plot weather stations on an interactive map.
-``` +For instructions on shedding weather stations based on your water level data and gauging station coordinates, please refer to the [water_level_pipeline.md](https://github.com/kaburia/filter-stations/water_level_pipeline.md) file. \ No newline at end of file diff --git a/filter-stations/__init__.py b/filter-stations/__init__.py index 6b08a63..259acca 100644 --- a/filter-stations/__init__.py +++ b/filter-stations/__init__.py @@ -1,3 +1,36 @@ +""" +Installation +------------ +To install the package, run the following command in your terminal: +```bash +pip install -U filter-stations +``` +Getting Started +--------------- +All methods require an API key and secret, which can be obtained by contacting TAHMO.
+- The ```retreive_data``` class is used to retrieve data from the TAHMO API endpoints.
+- The ```Filter``` class is used to filter weather stations data based on things like distance and region.
+- The ```pipeline``` class is used to create a pipeline of filters to apply to weather stations based on how they correlate with water level data.
+- The ```Interactive_maps``` class is used to plot weather stations on an interactive map.
+ +```python +# Import the necessary modules +from filter_stations import retreive_data, Filter, pipeline, Interactive_maps + +# Define the API key and secret +apiKey = 'your_api_key' # request from TAHMO +apiSecret = 'your_api_secret' # request from TAHMO +maps_key = 'your_google_maps_key' # retrieve from google maps platform + +# Initialize the class +ret = retreive_data(apiKey, apiSecret, maps_key) +fs = Filter(apiKey, apiSecret, maps_key) +pipe = pipeline(apiKey, apiSecret, maps_key) +maps = Interactive_maps(apiKey, apiSecret, maps_key) +``` + + +""" import requests import urllib.parse import pandas as pd @@ -46,7 +79,6 @@ # Get data class class retreive_data: - # initialize the class def __init__(self, apiKey, apiSecret, api_key): self.apiKey = apiKey self.apiSecret = apiSecret @@ -91,6 +123,21 @@ def get_stations_info(self, station=None, multipleStations=[], countrycode=None) ----------- - pandas.DataFrame: DataFrame containing information about the requested weather stations. + Usage: + ----------- + To retrieve information about a single station: + ```python + station_info = ret.get_stations_info(station='TA00001') + ``` + To retrieve information about multiple stations: + ```python + station_info = ret.get_stations_info(multipleStations=['TA00001', 'TA00002']) + ``` + To retrieve information about all stations in a country: + ```python + station_info = ret.get_stations_info(countrycode='KE') + ``` + """ # Make API request and convert response to DataFrame response = self.__request(endpoints['STATION_INFO'], {'sort':'code'}) @@ -148,9 +195,6 @@ def __splitDateRange(self, inputStartDate, inputEndDate): df['end'].iloc[-1] = pd.Timestamp(endDate) return df - def raw_measurements(self, station, startDate=None, endDate=None, variables=None): - return self.get_measurements(station, startDate=startDate, endDate=endDate, variables=variables, dataset='raw') - def k_neighbours(self, station, number=5): """ Returns a dictionary of the nearest neighbouring stations to the specified station. @@ -237,29 +281,57 @@ def trained_models(self, columns=None): return self.__handleApiError(apiRequest) - def aggregate_variables(self, dataframe): + def aggregate_variables(self, dataframe, freq='1D'): """ Aggregates a pandas DataFrame of weather variables by summing values across each day. Parameters: ----------- - dataframe (pandas.DataFrame): DataFrame containing weather variable data. + - freq (str, optional): Frequency to aggregate the data by. Defaults to '1D'. Returns: ----------- - pandas.DataFrame: DataFrame containing aggregated weather variable data, summed by day. 
+ + Usage: + ----------- + Define the DataFrame containing the weather variable data: + ```python + dataframe = ret.get_measurements('TA00001', '2020-01-01', '2020-01-31', ['pr']) # data comes in 5 minute interval + ``` + To aggregate data hourly: + ```python + hourly_data = ret.aggregate_variables(dataframe, freq='1H') + ``` + To aggregate data by 12 hours: + ```python + half_day_data = ret.aggregate_variables(dataframe, freq='12H') + ``` + To aggregate data by day: + ```python + daily_data = ret.aggregate_variables(dataframe, freq='1D') + ``` + To aggregate data by week: + ```python + weekly_data = ret.aggregate_variables(dataframe, freq='1W') + ``` + To aggregate data by month: + ```python + monthly_data = ret.aggregate_variables(dataframe, freq='1M') + ``` """ dataframe = dataframe.reset_index() dataframe.rename(columns={'index':'Date'}, inplace=True) # check if the column is all nan if dataframe.iloc[:, 1].isnull().all(): return dataframe.groupby(pd.Grouper(key='Date', axis=0, - freq='1D')).agg({f'{dataframe.columns[1]}': + freq=freq)).agg({f'{dataframe.columns[1]}': lambda x: np.nan if x.isnull().all() else x.isnull().sum()}) else: return dataframe.groupby(pd.Grouper(key='Date', axis=0, - freq='1D')).sum() + freq=freq)).sum() # aggregate qualityflags def aggregate_qualityflags(self, dataframe): @@ -285,7 +357,7 @@ def aggregate_qualityflags(self, dataframe): # Get the variables only - def get_measurements(self, station, startDate=None, endDate=None, variables=None, dataset='controlled', aggregate=False, quality_flags=False): + def get_measurements(self, station, startDate=None, endDate=None, variables=None, dataset='controlled', aggregate='5min', quality_flags=False): """ Get measurements from a station. @@ -303,6 +375,36 @@ def get_measurements(self, station, startDate=None, endDate=None, variables=None ----------- - A DataFrame containing the measurements. + Usage: + ----------- + To retrieve precipitation data for a station for the last month: + ```python + from datetime import datetime, timedelta + + # Get today's date + today = datetime.now() + + # Calculate one month ago + last_month = today - timedelta(days=30) + + # Format date as a string + last_month_str = last_month.strftime('%Y-%m-%d') + today_str = today.strftime('%Y-%m-%d') + + # Define the station you want to retrieve data from + station = 'TA00001' + variables = ['pr'] + dataset = 'raw' + + # aggregate the data to 30 minutes interval + aggregate = '30min' + + # Call the get_measurements method to retrieve and aggregate data + TA00001_data = ret.get_measurements(station, last_month_str, + today_str, variables, + dataset, aggregate) + ``` + """ #print('Get measurements', station, startDate, endDate, variables) endpoint = 'services/measurements/v2/stations/%s/measurements/%s' % (station, dataset) @@ -422,8 +524,11 @@ def get_measurements(self, station, startDate=None, endDate=None, variables=None # Merge all series together. if len(series) > 0: df = pd.concat(series, axis=1, sort=True) + else: df = pd.DataFrame() + + # Clean up memory. 
del series @@ -431,46 +536,80 @@ def get_measurements(self, station, startDate=None, endDate=None, variables=None # check if dataframe is empty if df.empty: # add the date range in the dataframe and the column as the station filled with NaN - df = pd.DataFrame(index=pd.date_range(start=startDate, end=endDate, tz='UTC', freq='5min'), columns=[f'{station}']) - if quality_flags: - if aggregate: - return self.aggregate_qualityflags(df) - else: - return df + df = pd.DataFrame(index=pd.date_range(start=startDate, end=endDate, tz='UTC', freq=aggregate), columns=[f'{station}']) + # remove the last row + return df[:-1] + else: - if aggregate: - return self.aggregate_variables(df) - else: - return df - - # retrieve data from multiple at a time - def retrieve_data(self, station, startDate, endDate, variables, dataset, aggregate): - try: - data = self.get_measurements(station, startDate, endDate, variables, dataset, aggregate) - return data - except Exception as e: - return station, str(e) - - def multiple_measurements(self, stations_list, csv_file, startDate, endDate, variables, dataset='controlled', aggregate=True): + # remove the last row + df = df[:-1] # lacks values for the last day + return self.aggregate_variables(df, freq=aggregate) + + def multiple_measurements(self, + stations_list, + startDate, + endDate, + variables, + dataset='controlled', + csv_file=None, + aggregate='1D'): """ - Retrieves measurements for multiple stations and saves the aggregated data to a CSV file. + Retrieves measurements for multiple stations within a specified date range. Parameters: ----------- - - stations_list (list): A list of strings containing the names of the stations to retrieve data from. - - csv_file (str): The name of the CSV file to save the data to. + - stations_list (list): A list of strings containing the codes of the stations to retrieve data from. - startDate (str): The start date for the measurements, in the format 'yyyy-mm-dd'. - endDate (str): The end date for the measurements, in the format 'yyyy-mm-dd'. - variables (list): A list of strings containing the names of the variables to retrieve. - - dataset (str): The name of the dataset to retrieve the data from. Default is 'controlled'. + - dataset (str): The name of the database to retrieve the data from. Default is 'controlled' alternatively 'raw' database. + - csv_file (str, optional): pass the name of the csv file to save the data otherwise it will return the dataframe. + - aggregate (bool): If True, aggregate the data per day; otherwise, return data in 5 minute interval. Returns: ----------- - df (pandas.DataFrame): A DataFrame containing the aggregated data for all stations. Raises: + ----------- + - ValueError: If stations_list is not a list. + + ### Example Usage: + To retrieve precipitation data for stations in Kenya for the last week and save it as a csv file: + ```python + # Import the necessary modules + from datetime import datetime, timedelta + from filter_stations import retreive_data + + # An instance of the retreive_data class + ret = retreive_data(apiKey, apiSecret, maps_key) + + # Get today's date + today = datetime.now() - ValueError: If stations_list is not a list. 
+ # Calculate one week ago + last_week = today - timedelta(days=7) + + # Format date as a string + last_week_str = last_week.strftime('%Y-%m-%d') + today_str = today.strftime('%Y-%m-%d') + + # Define the list of stations you want to retrieve data from example stations in Kenya + stations = list(ret.get_stations_info(countrycode='KE')['code']) + + # Get the precipitation data for the stations in the list + variables = ['pr'] + + # retrieve the raw data for the stations, aggregate the data and save it as a csv file + dataset = 'raw' + aggregate = '1D' + csv_file = 'Kenya_precipitation_data' + + # Call the multiple_measurements method to retrieve and aggregate data + aggregated_data = ret.multiple_measurements(stations, last_week_str, + today_str, variables, + dataset, csv_file, aggregate) + ``` """ if not isinstance(stations_list, list): raise ValueError('Pass in a list') @@ -482,7 +621,7 @@ def multiple_measurements(self, stations_list, csv_file, startDate, endDate, var results = [] with tqdm(total=len(stations_list), desc='Retrieving data for stations') as pbar: for station in stations_list: - results.append(pool.apply_async(self.retrieve_data, args=(station, startDate, endDate, variables, dataset, aggregate), callback=lambda _: pbar.update(1))) + results.append(pool.apply_async(self.get_measurements, args=(station, startDate, endDate, variables, dataset, aggregate), callback=lambda _: pbar.update(1))) pool.close() pool.join() @@ -491,8 +630,11 @@ def multiple_measurements(self, stations_list, csv_file, startDate, endDate, var if len(df_stats) > 0: df = pd.concat(df_stats, axis=1) - df.to_csv(f'{csv_file}.csv') - return df + if csv_file: + df.to_csv(f'{csv_file}.csv') + return df + else: + return df except Exception as e: print(f"An error occurred: {e}") finally: @@ -565,16 +707,16 @@ def anomalies_report(self, start_date, end_date=None): Usage: ----------- To retrieve anomaly reports for a specific date range: - ``` + ```python start_date = '2023-01-01' end_date = '2023-01-31' - report_data = your_instance.anomalies_report(start_date, end_date) + report_data = ret.anomalies_report(start_date, end_date) ``` To retrieve anomaly reports for a specific date: ``` start_date = '2023-01-01' - report_data = your_instance.anomalies_report(start_date) + report_data = ret.anomalies_report(start_date) ``` """ reqUrl = "https://datahub.tahmo.org/custom/sensordx/reports" # endpoint @@ -591,15 +733,123 @@ def anomalies_report(self, start_date, end_date=None): anomalies_data = pd.DataFrame(apiRequest.json()['qualityObjects']) level_2 = anomalies_data[(anomalies_data.level == 2) & (anomalies_data.type == 'sensordx')] level_2['station_sensor'] = level_2['stationCode'] + '_' + level_2['sensorCode'] - level_2 = level_2[['startDate', 'station_sensor', 'level']] + level_2 = level_2[['startDate', 'station_sensor', 'description', 'level']] level_2.startDate = pd.to_datetime([dateutil.parser.parse(i).strftime('%Y-%m-%d') for i in level_2['startDate']]) level_2.set_index('startDate', inplace=True) - if end_date: - return level_2.loc[start_date:end_date] - else: - return level_2.loc[start_date] + level_2 = level_2.sort_index() + # print(level_2) + try: + if end_date: + return level_2.loc[start_date:end_date] + else: + return level_2.loc[start_date] + except KeyError as e: + return e + else: + return self.__handleApiError(apiRequest) + + # get the ground truth data + def ground_truth(self, start_date, end_date=None, level=3): + """ + Retrieves ground truth data for a specified date range. 
+ + Parameters: + ----------- + - start_date (str): The start date for the report in 'yyyy-mm-dd' format. + - end_date (str, optional): The end date for the report in 'yyyy-mm-dd' format. + If not provided, only data for the start_date is returned. + + Returns: + ----------- + - pandas.DataFrame: A DataFrame containing ground truth data with columns 'startDate', + 'station_sensor', 'description' and 'level'. The 'startDate' column is used as the index. + + Raises: + ----------- + - Exception: If there's an issue with the API request. + + Usage: + ----------- + To retrieve ground truth data for a specific date range: + ```python + start_date = '2023-01-01' + end_date = '2023-01-31' + report_data = ret.ground_truth(start_date, end_date) + ``` + + To retrieve ground truth data for a specific date: + ``` + start_date = '2023-01-01' + report_data = ret.ground_truth(start_date) + ``` + """ + reqUrl = "https://datahub.tahmo.org/custom/sensordx/reports" # endpoint + # response = self.__request(reqUrl, {}) + print(f'API request: {reqUrl}') + apiRequest = requests.get(f'{reqUrl}', + params={}, + auth=requests.auth.HTTPBasicAuth( + self.apiKey, + self.apiSecret + ) + ) + if apiRequest.status_code == 200: + reports = pd.DataFrame(apiRequest.json()['qualityObjects']) + reports = reports[reports.level != 2][['startDate', 'endDate', 'stationCode', 'sensorCode', 'description', 'level']] + reports['station_sensor'] = reports.stationCode + '_' + reports.sensorCode + reports = reports.drop(['stationCode', 'sensorCode'], axis=1) + + # convert the start and end date to datetime format + reports['startDate'] = pd.to_datetime(reports['startDate']).dt.tz_localize(None) + reports['endDate'] = pd.to_datetime(reports['endDate']).dt.tz_localize(None) + # convert start_date string to datetime format + start_date_dt = pd.to_datetime(start_date).tz_localize(None) + + try: + if end_date is None: + # check for the date + def check_date(row): + if row.startDate <= start_date_dt and row.endDate >= start_date_dt: + return start_date + reports['Date'] = reports.apply(check_date, axis=1) + reports = reports.dropna() + reports = reports[['Date', 'station_sensor', 'description', 'level']] + reports.set_index('Date', inplace=True) + return reports + else: + # convert end_date string to datetime format + end_date_dt = pd.to_datetime(end_date).tz_localize(None) + + # Define a function to check if a date is within a range + def check_date(row, date): + return row['startDate'] <= date and row['endDate'] >= date + reports_list = [] + # Iterate over the date range + for single_date in pd.date_range(start_date, end_date): + # Filter the reports for the current date + filtered_reports = reports[reports.apply(check_date, axis=1, date=single_date)] + + # Add the current date as a new column + filtered_reports['Date'] = single_date + + # Append the filtered reports to the list + reports_list.append(filtered_reports) + filtered_reports_df = pd.concat(reports_list) + + # Drop the startDate and endDate columns + filtered_reports_df = filtered_reports_df.drop(['startDate', 'endDate'], axis=1) + # Set the index to the Date column + filtered_reports_df.set_index('Date', inplace=True) + return filtered_reports_df + + except KeyError as e: + return e + + + else: return self.__handleApiError(apiRequest) + ''' A specific class to evaluate and validate the water level data using TAHMO Stations To be used as it is to maintain flow @@ -623,7 +873,9 @@ def stations_within_radius(self, radius, latitude, longitude, df=False): - df (bool, optional): Flag 
indicating whether to return the result as a DataFrame. Defaults to False. Returns: - - DataFrame or list: DataFrame or list containing the stations within the specified radius. If df is True, a DataFrame is returned with the columns 'code', 'location.latitude', 'location.longitude', and 'distance'. If df is False, a list of station codes is returned. + - DataFrame or list: DataFrame or list containing the stations within the specified radius. If df is True, + a DataFrame is returned with the columns 'code', 'location.latitude', 'location.longitude', and 'distance'. + If df is False, a list of station codes is returned. """ stations = super().get_stations_info() @@ -716,7 +968,7 @@ def calculate_lag(self, weather_stations_data, water_level_data, lag=3, above=No def shed_stations(self, weather_stations_data, water_level_data, gauging_station_coords, radius, lag=3, - percentage=1, above=None, below=None): + percentage=1): """ Filters and processes weather station data to identify stations potentially contributing to water level changes above or below @@ -749,6 +1001,33 @@ def shed_stations(self, weather_stations_data, water_level_data, positive correlations and lagged changes above the specified threshold. - below_threshold_lag (list): List of weather stations with positive correlations and lagged changes below the specified threshold. + + Usage: + ------------ + Get the TAHMO stations that correlate with the water level data + ```python + import pandas as pd + from filter_stations import pipeline + + # An instance of the pipeline class + pipe = pipeline(apiKey, apiSecret, maps_key) + + # load the water level data and the weather stations data + water_level_data = pd.read_csv('water_level_data.csv') + weather_stations_data = pd.read_csv('weather_stations_data.csv') + + # get the coordinates of the gauging station + gauging_station_coords = (-0.416, 36.951) + + # get the stations within a radius of 200km from the gauging station + radius = 200 + + # get the stations that correlate with the water level data + above_threshold_lag, below_threshold_lag = pipe.shed_stations(weather_stations_data, water_level_data, + gauging_station_coords, radius, + lag=3, percentage=1) + ``` + """ # Filter the date range based on the water level data from first day of the water level data to the last day of the water level data weather_stations_data = weather_stations_data.loc[water_level_data.index[0]:water_level_data.index[-1]] @@ -1280,6 +1559,13 @@ def animation_grid(self, mu_pred, xi, xj, valid_station_df, clogged_station_df, Returns: ----------- - HTML: The animation as an HTML object. + + The animation as an MP4 file + + + """ fig, ax = plt.subplots() @@ -1402,6 +1688,10 @@ def get_map(self, subset_list, start_date=None, end_date=None, data_values=False -------- - my_map : folium.folium.Map A Folium map object showing the locations of the weather stations in the given subsets. + +
+ Subset Map +
""" # Read the csv file df_rainfall = pd.read_csv(csv_file) @@ -1454,7 +1744,7 @@ def get_map(self, subset_list, start_date=None, end_date=None, data_values=False # From the loaded data on the jobs scored, format the data class transform_data: - # inherit from retrieve_data class + def __init__(self, apiKey, apiSecret, api_key): super().__init__(apiKey, apiSecret, api_key) diff --git a/setup.py b/setup.py index d188172..293a97b 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='filter_stations', - version='0.4.6', + version='0.5.1', packages=find_packages(), include_package_data=True, description='Making it easier to navigate and clean TAHMO weather station data for ML development', diff --git a/water_level_pipeline.md b/water_level_pipeline.md new file mode 100644 index 0000000..f530ef6 --- /dev/null +++ b/water_level_pipeline.md @@ -0,0 +1,1851 @@ + +## Water Level Pipeline +- A series of functions to be added to the filter-stations module in pypi to evalute which TAHMO stations to use that corroborates with the water level +- All begins with the coordinates of the gauging station(location of the monitoring sensor) + + +```python +import os +from pathlib import Path +import haversine as hs +import pandas as pd +import numpy as np +import datetime +import statsmodels.api as sm +from matplotlib.dates import DateFormatter +import matplotlib.pyplot as plt +import warnings +import dateutil.parser +warnings.filterwarnings('ignore') + +# config_path +config_path = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'config.json') +``` + + +```python +from filter_stations import retreive_data, Interactive_maps, Filter, pipeline +import json +# Authentication +with open(config_path) as f: + conf = json.load(f) + +apiKey = conf['apiKey'] +apiSecret = conf['apiSecret'] +map_api_key = conf['map_api_key'] +fs = retreive_data(apiKey, apiSecret, map_api_key) +pipe = pipeline(apiKey, apiSecret, map_api_key) +maps = Interactive_maps(apiKey, apiSecret, map_api_key) +``` + +### Loading data +Load the water level data from the github repository[Link here]
+Load the TAHMO station data from the [Link here]
+ + +```python +# muringato +muringato_loc = [-0.406689, 36.96301] +# ewaso +ewaso_loc = [0.026833, 36.914637] + +# Weather stations data +weather_stations_data = pd.read_csv(os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'stations_precipitation.csv')) + +''' The water level data ''' +# muringato data sensor 2 2021 +muringato_data_s2_2021 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water_data_2021', 'muringato-sensor2.csv') + +# muringato data sensor 2 2022 +muringato_data_s2_2022 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water_data_2021', 'muringato-sensor2-2022.csv') + +# muringato data sensor 6 2021 +muringato_data_s6_2021 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water_data_2021', 'muringato-sensor6.csv') + +# muringato data sensor 6 2022 +muringato_data_s6_2022 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water_data_2021', 'muringato-sensor6-2022.csv') + + +# ewaso data sensor 2020 convert the time column to datetime +ewaso_data_2020 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water-level-data-ewaso', '1E2020.csv') + +# ewaso data sensor 2022 +ewaso_data_2022 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water-level-data-ewaso', '1E2022.csv') + +weather_stations_data.Date = weather_stations_data.Date.astype('datetime64[ns]') +weather_stations_data.set_index('Date', inplace=True) + +``` + +To format water level it needs to have a time column and water level column the names can be different but the order must be that + + +```python +# handle the water level data +def format_water_level(water_level_data_path): + # data needs to be in the format time, data/water_level or whatever the column is called + water_level_data = pd.read_csv(water_level_data_path) + # rename the first column to time + water_level_data.rename(columns={water_level_data.columns[0]: 'time'}, inplace=True) + # convert the time column to datetime + water_level_data.time = pd.to_datetime([dateutil.parser.parse(i).strftime('%d-%m-%Y') for i in water_level_data['time']]) + water_level_data.time = water_level_data.time.astype('datetime64[ns]') + # rename the column to water_level + water_level_data.rename(columns={water_level_data.columns[1]: 'water_level'}, inplace=True) + # set the time column as the index + water_level_data.set_index('time', inplace=True) + return water_level_data +``` + + +```python +muringato_data_s2_2021 = format_water_level(muringato_data_s2_2021) +muringato_data_s2_2022 = format_water_level(muringato_data_s2_2022) +muringato_data_s6_2021 = format_water_level(muringato_data_s6_2021) +muringato_data_s6_2022 = format_water_level(muringato_data_s6_2022) +ewaso_data_2020 = format_water_level(ewaso_data_2020) +ewaso_data_2022 = format_water_level(ewaso_data_2022) + +``` + +1. Filter the date range based on the water level data from first day of the water level data to the last day of the water level data +2. Choose stations within a certain radius of the gauging station 100 km for example get the resulting weather data +3. Get the stations with only 100 percent data no missing data +4. Remove the stations data with the value zero from beginning to end if the water level data has some values above zero +5. Calculate the correlation between the water level data and the weather data needs to be above 0 and have a lag of maximum 3 days +6. 
Plot the resulting figures + + +### Choosing ewaso 2020 range +removing stations with missing data reduces from 1035 to 849 columns
+removing all zeros reduces from 849 to 604 columns
+columns with positive correlation reduces the number from 604 columns to 283 columns
+checking for lag reduces the columns to 80 + + +```python +above, below = pipe.shed_stations(weather_stations_data, + muringato_data_s6_2022, + muringato_loc, + 100, + lag=3 + ) + +``` + + +```python +below_stations = [i.split('_')[0] for i in below.keys()] +print(below_stations) +below_stations_metadata = fs.get_stations_info(multipleStations=below_stations)[['code', 'location.latitude', 'location.longitude']] +``` + + ['TA00001', 'TA00023', 'TA00024', 'TA00025', 'TA00054', 'TA00056', 'TA00067', 'TA00077', 'TA00129', 'TA00147', 'TA00154', 'TA00155', 'TA00156', 'TA00166', 'TA00171', 'TA00189', 'TA00215', 'TA00222', 'TA00228', 'TA00230', 'TA00233', 'TA00250', 'TA00270', 'TA00270', 'TA00272', 'TA00272', 'TA00316', 'TA00317', 'TA00355', 'TA00459', 'TA00473', 'TA00480', 'TA00493', 'TA00494', 'TA00577', 'TA00601', 'TA00621', 'TA00653', 'TA00672', 'TA00676', 'TA00679', 'TA00692', 'TA00699', 'TA00704', 'TA00705', 'TA00711', 'TA00712', 'TA00712', 'TA00715', 'TA00717', 'TA00750', 'TA00751', 'TA00767'] + + + +```python +below_stations_metadata['distance']= below_stations_metadata.apply(lambda row: hs.haversine((muringato_loc[0], + muringato_loc[1]), (row['location.latitude'], + row['location.longitude'])), axis=1) +below_stations_metadata.sort_values(by='distance') +``` + + + + +
+|     | code    | location.latitude | location.longitude | distance (km) |
+|----:|---------|------------------:|-------------------:|--------------:|
+|  52 | TA00056 | -0.721656 | 37.145585 | 40.480889 |
+|  22 | TA00024 | -1.071731 | 37.045578 | 74.517013 |
+| 150 | TA00166 | -0.319508 | 37.659139 | 78.009238 |
+| 172 | TA00189 | -0.795260 | 37.665930 | 89.304790 |
+| 230 | TA00250 | -0.778940 | 37.676738 | 89.504935 |
+| 600 | TA00715 | -1.225618 | 36.809065 | 92.655456 |
+| 565 | TA00679 | -1.270835 | 36.723916 | 99.698089 |
+|  23 | TA00025 | -1.301839 | 36.760200 | 102.058383 |
+| 422 | TA00473 | -0.512371 | 35.956813 | 112.495996 |
+| 513 | TA00621 | -1.633020 | 37.146185 | 137.874253 |
+|  51 | TA00054 | -0.239342 | 35.728897 | 138.480985 |
+| 424 | TA00480 | -1.376152 | 37.797646 | 142.238019 |
+|  61 | TA00067 | -1.794285 | 37.621211 | 170.765765 |
+| 140 | TA00156 | -1.701123 | 38.068339 | 189.255406 |
+|  71 | TA00077 | -0.383066 | 35.068406 | 210.682047 |
+| 139 | TA00155 | -2.523037 | 36.829437 | 235.795373 |
+|  21 | TA00023 | -2.388550 | 38.040767 | 250.831198 |
+| 155 | TA00171 | -0.002710 | 34.596908 | 266.903936 |
+| 291 | TA00317 | 0.040440 | 34.371716 | 292.394991 |
+|   0 | TA00001 | -1.123283 | 34.397992 | 296.112467 |
+| 652 | TA00767 | -2.671990 | 38.369665 | 296.467402 |
+| 290 | TA00316 | 0.289862 | 34.371222 | 298.418648 |
+| 131 | TA00147 | 0.449274 | 34.282303 | 312.905564 |
+| 117 | TA00129 | -3.390926 | 37.717656 | 342.264311 |
+| 138 | TA00154 | -4.231107 | 37.847804 | 436.466702 |
+| 211 | TA00230 | 1.724690 | 33.622000 | 440.623881 |
+| 329 | TA00355 | 3.498069 | 35.843897 | 451.651266 |
+| 544 | TA00653 | 0.265062 | 32.627203 | 487.869319 |
+| 196 | TA00215 | 0.052465 | 32.440690 | 505.441217 |
+| 203 | TA00222 | 1.186240 | 32.020330 | 577.409865 |
+| 584 | TA00699 | -0.707570 | 31.402138 | 619.216128 |
+| 558 | TA00672 | -6.180302 | 37.146832 | 642.321296 |
+| 597 | TA00712 | -6.676308 | 39.131552 | 737.484276 |
+| 562 | TA00676 | -6.780374 | 38.973512 | 742.978650 |
+| 635 | TA00750 | -6.805316 | 39.139843 | 751.347364 |
+| 636 | TA00751 | -6.848668 | 39.082174 | 753.892793 |
+| 432 | TA00494 | -6.833860 | 39.167475 | 755.338586 |
+| 248 | TA00270 | -6.842390 | 39.156760 | 755.852180 |
+| 250 | TA00272 | -6.890039 | 39.117927 | 759.501414 |
+| 431 | TA00493 | -6.910845 | 39.075597 | 760.236606 |
+| 214 | TA00233 | 3.453500 | 31.251250 | 766.277105 |
+| 209 | TA00228 | 3.404720 | 30.959600 | 790.422401 |
+| 498 | TA00601 | -14.080148 | 33.907593 | 1557.147407 |
+| 602 | TA00717 | 3.898305 | 11.886437 | 2827.236339 |
+| 590 | TA00705 | 4.952251 | 8.341692 | 3234.191975 |
+| 481 | TA00577 | 10.487147 | 9.788223 | 3240.086078 |
+| 589 | TA00704 | 5.378602 | 6.998292 | 3388.907422 |
+| 596 | TA00711 | 4.906530 | 6.917064 | 3389.011984 |
+| 410 | TA00459 | 9.066148 | 6.569080 | 3526.820348 |
+| 577 | TA00692 | 6.404114 | 5.626307 | 3559.025765 |
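+
+If only stations close to the gauge are wanted, the metadata can be narrowed down by the computed `distance` column; the 100 km cut-off below is illustrative, matching the radius used earlier:
+
+
+```python
+# keep stations within an illustrative 100 km of the Muringato gauge
+nearby_stations = below_stations_metadata[below_stations_metadata['distance'] <= 100]
+print(nearby_stations.sort_values(by='distance')['code'].tolist())
+```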
+
+
+```python
+# Interactive visuals
+import plotly.express as px
+import plotly.graph_objects as go
+
+fig = px.scatter_mapbox(below_stations_metadata,
+                        lat="location.latitude",
+                        lon="location.longitude",
+                        hover_name="code",
+                        hover_data=["distance"],
+                        color_discrete_sequence=["fuchsia"],
+                        zoom=8,
+                        height=800,
+                        )
+# update marker size
+fig.update_traces(marker=dict(size=10))
+# add a point for the central (gauging) station
+fig.add_trace(go.Scattermapbox(
+    lat=[muringato_loc[0]],
+    lon=[muringato_loc[1]],
+    mode='markers',
+    marker=go.scattermapbox.Marker(
+        size=14
+    ),
+    text=['Muringato gauging station'],
+))
+
+fig.update_layout(
+    mapbox_style="carto-positron",
+    margin={"r": 0, "t": 0, "l": 0, "b": 0},
+    showlegend=False
+)
+fig.show()
+```
+
+
+```python
+pipe.plot_figs(
+    weather_stations_data,
+    list(muringato_data_s6_2022['water_level']),
+    list(below.keys()),
+    date=dateutil.parser.parse(str(muringato_data_s6_2022.index[0])).strftime('%d-%m-%Y'),
+    save=False
+)
+```
+
+    Begin plotting!
+
+![png](water_level_pipeline_files/water_level_pipeline_15_1.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_2.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_3.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_4.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_5.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_6.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_7.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_8.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_9.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_10.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_11.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_12.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_13.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_14.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_15.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_16.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_17.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_18.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_19.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_20.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_21.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_22.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_23.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_24.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_25.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_26.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_27.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_28.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_29.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_30.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_31.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_32.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_33.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_34.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_35.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_36.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_37.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_38.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_39.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_40.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_41.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_42.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_43.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_44.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_45.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_46.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_47.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_48.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_49.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_50.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_51.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_52.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_15_53.png)
+
+
+```python
+RADIUS = 100
+
+# restrict the weather data to the span of the ewaso 2020 water level data
+ewaso_weather_data_2020 = weather_stations_data.loc[ewaso_data_2020.index[0]:ewaso_data_2020.index[-1]]
+# ewaso stations within the chosen radius
+ewaso_tahmo_stations_2020 = pipe.stations_within_radius(RADIUS, ewaso_loc[0], ewaso_loc[1], df=False)
+# keep only stations without missing data (percentage=1 means 100 percent complete)
+ewaso_weather_data_2020_filtered = pipe.stations_data_check(stations_list=list(ewaso_tahmo_stations_2020),
+                                                            percentage=1, data=ewaso_weather_data_2020
+                                                            )
+# drop columns that sum to zero; this applies when the water level data itself is not all zero
+ewaso_weather_data_2020_filtered = ewaso_weather_data_2020_filtered.loc[:, ewaso_weather_data_2020_filtered.sum() != 0]
+```
+
+    API request: services/assets/v2/stations
+
+
+```python
+import statsmodels.api as sm
+
+def calculate_lag(weather_stations_data, water_level_data, lag=3, above=None, below=None):
+    above_threshold_lag = dict()
+    below_threshold_lag = dict()
+    for cols in weather_stations_data.columns:
+        # skip columns that do not correlate positively with the water level
+        if weather_stations_data[cols].corr(water_level_data['water_level']) <= 0:
+            continue
+        # for the remaining columns, find the lag with the highest cross-correlation
+        coefficient_list = list(sm.tsa.stattools.ccf(weather_stations_data[cols], water_level_data['water_level']))
+        a = np.argmax(coefficient_list)
+        if a > lag:
+            above_threshold_lag[cols] = a
+        elif a <= lag:
+            below_threshold_lag[cols] = a
+    if above:
+        return above_threshold_lag
+    elif below:
+        return below_threshold_lag
+    else:
+        return above_threshold_lag, below_threshold_lag
+```
+
+Bringing all the functions together creates the pipeline:
+
+
+```python
+def shed_stations(weather_stations_data, water_level_data,
+                  gauging_station_coords, radius, lag=3,
+                  percentage=1, above=None, below=None):
+    # filter the date range to the span of the water level data
+    weather_stations_data = weather_stations_data.loc[water_level_data.index[0]:water_level_data.index[-1]]
+    # keep only the weather stations within the radius
+    lat, lon = gauging_station_coords[0], gauging_station_coords[1]
+    weather_stations_data_list = pipe.stations_within_radius(radius, lat, lon, df=False)
+    # keep stations meeting the completeness threshold (percentage=1 means no missing data)
+    weather_stations_data_filtered = pipe.stations_data_check(stations_list=weather_stations_data_list,
+                                                              percentage=percentage,
+                                                              data=weather_stations_data)
+    # drop columns that sum to zero; this applies when the water level data itself is not all zero
+    weather_stations_data_filtered = weather_stations_data_filtered.loc[:, weather_stations_data_filtered.sum() != 0]
+
+    # split the remaining stations by positive correlation and lag; above/below are
+    # forwarded so that a single dictionary can be requested instead of the pair
+    return calculate_lag(weather_stations_data_filtered, water_level_data, lag=lag,
+                         above=above, below=below)
+```
+
+
+```python
+above_threshold_lag, below_threshold_lag = shed_stations(weather_stations_data, ewaso_data_2020, ewaso_loc, RADIUS, lag=3, percentage=1)
+len(below_threshold_lag)
+```
+
+    API request: services/assets/v2/stations
+
+    80
+
+
+### Plot the figures
+
+
+```python
+pipe.plot_figs(
+    weather_stations_data,
+    list(ewaso_data_2020['water_level']),
+    list(below_threshold_lag.keys()),
+    date=dateutil.parser.parse(str(ewaso_data_2020.index[0])).strftime('%d-%m-%Y'),
+    save=True
+)
+```
+
+    Begin plotting!
+
+![png](water_level_pipeline_files/water_level_pipeline_22_1.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_2.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_3.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_4.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_5.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_6.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_7.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_8.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_9.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_10.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_11.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_12.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_13.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_14.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_15.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_16.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_17.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_18.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_19.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_20.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_21.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_22.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_23.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_24.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_25.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_26.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_27.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_28.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_29.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_30.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_31.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_32.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_33.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_34.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_35.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_36.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_37.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_38.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_39.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_40.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_41.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_42.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_43.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_44.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_45.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_46.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_47.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_48.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_49.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_50.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_51.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_52.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_53.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_54.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_55.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_56.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_57.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_58.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_59.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_60.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_61.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_62.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_63.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_64.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_65.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_66.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_67.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_68.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_69.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_70.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_71.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_72.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_73.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_74.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_75.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_76.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_77.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_78.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_79.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_22_80.png)
+
+The comparison below takes two inputs:
+- the water level data
+- the TAHMO station data
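+
+`muringato_tahmo_stations` and `ewaso_tahmo_stations` are not defined in this export; a minimal sketch of how they could be built, assuming `stations_within_radius` with `df=True` returns the station metadata as a DataFrame with `location.latitude` and `location.longitude` columns:
+
+
+```python
+# hypothetical reconstruction: station metadata within 100 km of each gauge
+muringato_tahmo_stations = pipe.stations_within_radius(100, muringato_loc[0], muringato_loc[1], df=True)
+ewaso_tahmo_stations = pipe.stations_within_radius(100, ewaso_loc[0], ewaso_loc[1], df=True)
+```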
+
+
+```python
+# plot the two station sets with different colors
+fig, ax = plt.subplots(figsize=(10, 10))
+muringato_tahmo_stations.plot(kind='scatter',
+                              x='location.longitude',
+                              y='location.latitude',
+                              color='blue',
+                              alpha=0.7,
+                              ax=ax)
+ewaso_tahmo_stations.plot(kind='scatter',
+                          x='location.longitude',
+                          y='location.latitude',
+                          color='red',
+                          alpha=0.7,
+                          ax=ax)
+plt.show()
+```
+
+![png](water_level_pipeline_files/water_level_pipeline_24_0.png)
+
+Apart from completeness, another way to validate and weed out unusable sensors is to check for a positive correlation and an acceptable lag:
+- The default maximum lag is 3 days between a particular station and the gauging station.
+- The required input format is time series data.
+- Provide the column names for evaluation in the format [Date, data].
+- By changing the parameters, one can select the stations above or below the lag threshold.
+
+
+```python
+def plot_figs(weather_stations, water_list, threshold_list, save=False, dpi=500, date='11-02-2021'):
+    start_date = datetime.datetime.strptime(date, "%d-%m-%Y")
+    end_date = start_date + datetime.timedelta(len(water_list)-1)
+    df_plot = weather_stations[start_date:end_date]
+    df_plot = df_plot[threshold_list].reset_index()
+    df_plot.rename(columns={'index': 'Date'}, inplace=True)
+
+    plt.rcParams['figure.figsize'] = (15, 9)
+    print('Begin plotting!')
+
+    for cols in df_plot.columns[1:]:
+        fig, ax1 = plt.subplots()
+        color = 'tab:blue'
+        ax1.set_xlabel('Time', fontsize=24, weight='bold')
+        ax1.set_ylabel(f'Rainfall {cols} (mm)', color=color, fontsize=24, weight='bold')
+        ax1.bar(pd.to_datetime(df_plot['Date'], format="%d/%m/%Y"), df_plot[cols], color=color, width=4, alpha=1.0)
+        ax1.tick_params(axis='y', labelcolor=color, labelsize=24)
+        # style the x tick labels directly instead of overriding them with set_xticklabels
+        ax1.tick_params(axis='x', labelsize=21)
+        ax1.grid(color='gray', linestyle='--', linewidth=0.8)
+        ax1.set(facecolor="white")
+        ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
+
+        color = 'tab:red'
+        ax2.set_ylabel('Water level/Stage (m)', color=color, fontsize=24, weight='bold')
+        ax2.plot(pd.to_datetime(df_plot['Date'], format="%d/%m/%Y"), water_list, color=color, linewidth=4)
+        ax2.tick_params(axis='y', labelcolor=color, labelsize=24)
+        ax2.set(facecolor="white")
+        plt.title('Stage and Rainfall against Time', fontsize=22, weight='bold')
+
+        date_form = DateFormatter("%m-%y")
+        ax1.xaxis.set_major_formatter(date_form)
+        fig.tight_layout()
+
+        if save:
+            fig.savefig(f'{cols}.png', dpi=dpi)
+```
+
+
+```python
+# stations_df and lag_ come from a separate run that is not shown in this export
+plot_figs(stations_df, lag_[list(lag_.keys())[0]]['water_list'], list(lag_.keys()), save=True, date='12-05-2020')
+```
+
+    Begin plotting!
+
+![png](water_level_pipeline_files/water_level_pipeline_27_1.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_27_2.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_27_3.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_27_4.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_27_5.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_27_6.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_27_7.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_27_8.png)
+
+![png](water_level_pipeline_files/water_level_pipeline_27_9.png)
+
+Finally, format the keys to retrieve the stations' metadata:
+
+
+```python
+def filter_metadata(lag_keys):
+    # station codes are the part of each key before the first underscore
+    captured_list = [i.split('_')[0] for i in list(lag_keys)]
+    return fs.get_stations_info(multipleStations=captured_list)
+```
+
+
+```python
+filter_metadata(list(lag_.keys()))
+```
+
+    API request: services/assets/v2/stations
+|     | code | status | installationdate | elevationground | sensorinstallations | dataloggerinstallations | creatorid | created | updaterid | updated | ... | location.countrycode | location.zipcode | location.latitude | location.longitude | location.elevationmsl | location.note | location.creatorid | location.created | location.updaterid | location.updated |
+|----:|------|-------:|------------------|----------------:|---------------------|-------------------------|----------:|---------|----------:|---------|-----|----------------------|------------------|------------------:|-------------------:|----------------------:|---------------|-------------------:|------------------|-------------------:|------------------|
+| 26  | TA00028 | 1 | 2015-08-31T00:00:00Z | 9.0 | None | None | 2 | 2018-12-11T08:35:17.888233Z | 2 | 2018-12-11T08:35:17.888233Z | ... | KE |  | 0.055219 | 37.136747 | 2003.6 | {} | 2 | 2018-10-26T13:32:16.15537Z | 37 | 2022-06-30T11:11:50.27135Z |
+| 27  | TA00029 | 1 | 2015-09-02T00:00:00Z | 2.0 | None | None | 2 | 2018-12-11T08:36:19.30342Z | 2 | 2018-12-11T08:36:19.30342Z | ... | KE |  | -0.500776 | 36.587511 | 2545.8 | {} | 2 | 2018-10-26T13:33:31.451613Z | 37 | 2022-02-28T12:25:09.578242Z |
+| 53  | TA00057 | 1 | 2015-10-08T00:00:00Z | 2.0 | None | None | 2 | 2018-12-11T09:21:29.092833Z | 2 | 2018-12-11T09:21:29.092833Z | ... | KE |  | -1.253030 | 36.856487 | 1645.3 | {} | 2 | 2018-10-29T09:13:33.768613Z | 2 | 2022-07-26T07:34:06.603938Z |
+| 68  | TA00074 | 1 | 2015-11-19T00:00:00Z | 2.0 | None | None | 2 | 2018-12-11T09:38:25.742397Z | 2 | 2018-12-11T09:38:25.742397Z | ... | KE |  | -0.566080 | 37.074412 | 1726.8 | {} | 2 | 2018-10-29T10:35:28.49617Z | 2 | 2022-07-26T07:38:42.100985Z |
+| 74  | TA00080 | 1 | 2016-01-28T00:00:00Z | 2.0 | None | None | 2 | 2018-12-11T09:43:10.523398Z | 2 | 2018-12-11T09:43:10.523398Z | ... | KE |  | -1.087589 | 36.818402 | 1777.3 | {} | 2 | 2018-10-29T10:53:47.845042Z | 37 | 2022-02-28T13:07:04.709903Z |
+| 150 | TA00166 | 1 | 2017-05-11T00:00:00Z | 2.0 | None | None | 2 | 2018-12-12T08:29:28.10697Z | 2 | 2018-12-12T08:29:28.10697Z | ... | KE |  | -0.319508 | 37.659139 | 1404.0 | {} | 2 | 2018-11-10T08:47:37.949135Z | 2 | 2018-11-10T08:47:37.949135Z |
+
+*6 rows × 28 columns*
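+
+Putting the pieces together, an end-to-end pass over a gauge could look like the following sketch, using only the functions defined above (the radius and lag values are the ones used throughout this walkthrough):
+
+
+```python
+# sketch: from raw inputs to the metadata of the usable stations
+above, below = shed_stations(weather_stations_data, ewaso_data_2020,
+                             ewaso_loc, radius=100, lag=3, percentage=1)
+usable_stations_metadata = filter_metadata(list(below.keys()))
+print(usable_stations_metadata['code'].tolist())
+```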