diff --git a/README.md b/README.md
index 56db659..66708dc 100644
--- a/README.md
+++ b/README.md
@@ -1,1858 +1,15 @@
+
## Documentation
+You can find the documentation for the project by following this link
https://filter-stations.netlify.app/
-## Installation
-```
-pip install filter-stations
-```
-
-## Water Level Pipeline
-- A series of functions to be added to the filter-stations module in pypi to evalute which TAHMO stations to use that corroborates with the water level
-- All begins with the coordinates of the gauging station(location of the monitoring sensor)
-
-
-```python
-import os
-from pathlib import Path
-import haversine as hs
-import pandas as pd
-import numpy as np
-import datetime
-import statsmodels.api as sm
-from matplotlib.dates import DateFormatter
-import matplotlib.pyplot as plt
-import warnings
-import dateutil.parser
-warnings.filterwarnings('ignore')
-
-# config_path
-config_path = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'config.json')
-```
-
-
-```python
-from filter_stations import retreive_data, Interactive_maps, Filter, pipeline
-import json
-# Authentication
-with open(config_path) as f:
- conf = json.load(f)
-
-apiKey = conf['apiKey']
-apiSecret = conf['apiSecret']
-map_api_key = conf['map_api_key']
-fs = retreive_data(apiKey, apiSecret, map_api_key)
-pipe = pipeline(apiKey, apiSecret, map_api_key)
-maps = Interactive_maps(apiKey, apiSecret, map_api_key)
-```
-
-### Loading data
-Load the water level data from the github repository[Link here]
-Load the TAHMO station data from the [Link here]
-
-
-```python
-# muringato
-muringato_loc = [-0.406689, 36.96301]
-# ewaso
-ewaso_loc = [0.026833, 36.914637]
-
-# Weather stations data
-weather_stations_data = pd.read_csv(os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'stations_precipitation.csv'))
-
-''' The water level data '''
-# muringato data sensor 2 2021
-muringato_data_s2_2021 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water_data_2021', 'muringato-sensor2.csv')
-
-# muringato data sensor 2 2022
-muringato_data_s2_2022 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water_data_2021', 'muringato-sensor2-2022.csv')
-
-# muringato data sensor 6 2021
-muringato_data_s6_2021 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water_data_2021', 'muringato-sensor6.csv')
-
-# muringato data sensor 6 2022
-muringato_data_s6_2022 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water_data_2021', 'muringato-sensor6-2022.csv')
-
-
-# ewaso data sensor 2020 convert the time column to datetime
-ewaso_data_2020 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water-level-data-ewaso', '1E2020.csv')
-
-# ewaso data sensor 2022
-ewaso_data_2022 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water-level-data-ewaso', '1E2022.csv')
-
-weather_stations_data.Date = weather_stations_data.Date.astype('datetime64[ns]')
-weather_stations_data.set_index('Date', inplace=True)
-
-```
-
-To format water level it needs to have a time column and water level column the names can be different but the order must be that
-
-
-```python
-# handle the water level data
-def format_water_level(water_level_data_path):
- # data needs to be in the format time, data/water_level or whatever the column is called
- water_level_data = pd.read_csv(water_level_data_path)
- # rename the first column to time
- water_level_data.rename(columns={water_level_data.columns[0]: 'time'}, inplace=True)
- # convert the time column to datetime
- water_level_data.time = pd.to_datetime([dateutil.parser.parse(i).strftime('%d-%m-%Y') for i in water_level_data['time']])
- water_level_data.time = water_level_data.time.astype('datetime64[ns]')
- # rename the column to water_level
- water_level_data.rename(columns={water_level_data.columns[1]: 'water_level'}, inplace=True)
- # set the time column as the index
- water_level_data.set_index('time', inplace=True)
- return water_level_data
-```
-
-
-```python
-muringato_data_s2_2021 = format_water_level(muringato_data_s2_2021)
-muringato_data_s2_2022 = format_water_level(muringato_data_s2_2022)
-muringato_data_s6_2021 = format_water_level(muringato_data_s6_2021)
-muringato_data_s6_2022 = format_water_level(muringato_data_s6_2022)
-ewaso_data_2020 = format_water_level(ewaso_data_2020)
-ewaso_data_2022 = format_water_level(ewaso_data_2022)
-
-```
-
-1. Filter the date range based on the water level data from first day of the water level data to the last day of the water level data
-2. Choose stations within a certain radius of the gauging station 100 km for example get the resulting weather data
-3. Get the stations with only 100 percent data no missing data
-4. Remove the stations data with the value zero from beginning to end if the water level data has some values above zero
-5. Calculate the correlation between the water level data and the weather data needs to be above 0 and have a lag of maximum 3 days
-6. Plot the resulting figures
-
-
-### Choosing ewaso 2020 range
-removing stations with missing data reduces from 1035 to 849 columns
-removing all zeros reduces from 849 to 604 columns
-columns with positive correlation reduces the number from 604 columns to 283 columns
-checking for lag reduces the columns to 80
-
-
-```python
-above, below = pipe.shed_stations(weather_stations_data,
- muringato_data_s6_2022,
- muringato_loc,
- 100,
- lag=3
- )
-
-```
-
-
-```python
-below_stations = [i.split('_')[0] for i in below.keys()]
-print(below_stations)
-below_stations_metadata = fs.get_stations_info(multipleStations=below_stations)[['code', 'location.latitude', 'location.longitude']]
-```
-
- ['TA00001', 'TA00023', 'TA00024', 'TA00025', 'TA00054', 'TA00056', 'TA00067', 'TA00077', 'TA00129', 'TA00147', 'TA00154', 'TA00155', 'TA00156', 'TA00166', 'TA00171', 'TA00189', 'TA00215', 'TA00222', 'TA00228', 'TA00230', 'TA00233', 'TA00250', 'TA00270', 'TA00270', 'TA00272', 'TA00272', 'TA00316', 'TA00317', 'TA00355', 'TA00459', 'TA00473', 'TA00480', 'TA00493', 'TA00494', 'TA00577', 'TA00601', 'TA00621', 'TA00653', 'TA00672', 'TA00676', 'TA00679', 'TA00692', 'TA00699', 'TA00704', 'TA00705', 'TA00711', 'TA00712', 'TA00712', 'TA00715', 'TA00717', 'TA00750', 'TA00751', 'TA00767']
-
-
-
-```python
-below_stations_metadata['distance']= below_stations_metadata.apply(lambda row: hs.haversine((muringato_loc[0],
- muringato_loc[1]), (row['location.latitude'],
- row['location.longitude'])), axis=1)
-below_stations_metadata.sort_values(by='distance')
-```
-
-
-
-
-
-
-
-
-
-
-
code
-
location.latitude
-
location.longitude
-
distance
-
-
-
-
-
52
-
TA00056
-
-0.721656
-
37.145585
-
40.480889
-
-
-
22
-
TA00024
-
-1.071731
-
37.045578
-
74.517013
-
-
-
150
-
TA00166
-
-0.319508
-
37.659139
-
78.009238
-
-
-
172
-
TA00189
-
-0.795260
-
37.665930
-
89.304790
-
-
-
230
-
TA00250
-
-0.778940
-
37.676738
-
89.504935
-
-
-
600
-
TA00715
-
-1.225618
-
36.809065
-
92.655456
-
-
-
565
-
TA00679
-
-1.270835
-
36.723916
-
99.698089
-
-
-
23
-
TA00025
-
-1.301839
-
36.760200
-
102.058383
-
-
-
422
-
TA00473
-
-0.512371
-
35.956813
-
112.495996
-
-
-
513
-
TA00621
-
-1.633020
-
37.146185
-
137.874253
-
-
-
51
-
TA00054
-
-0.239342
-
35.728897
-
138.480985
-
-
-
424
-
TA00480
-
-1.376152
-
37.797646
-
142.238019
-
-
-
61
-
TA00067
-
-1.794285
-
37.621211
-
170.765765
-
-
-
140
-
TA00156
-
-1.701123
-
38.068339
-
189.255406
-
-
-
71
-
TA00077
-
-0.383066
-
35.068406
-
210.682047
-
-
-
139
-
TA00155
-
-2.523037
-
36.829437
-
235.795373
-
-
-
21
-
TA00023
-
-2.388550
-
38.040767
-
250.831198
-
-
-
155
-
TA00171
-
-0.002710
-
34.596908
-
266.903936
-
-
-
291
-
TA00317
-
0.040440
-
34.371716
-
292.394991
-
-
-
0
-
TA00001
-
-1.123283
-
34.397992
-
296.112467
-
-
-
652
-
TA00767
-
-2.671990
-
38.369665
-
296.467402
-
-
-
290
-
TA00316
-
0.289862
-
34.371222
-
298.418648
-
-
-
131
-
TA00147
-
0.449274
-
34.282303
-
312.905564
-
-
-
117
-
TA00129
-
-3.390926
-
37.717656
-
342.264311
-
-
-
138
-
TA00154
-
-4.231107
-
37.847804
-
436.466702
-
-
-
211
-
TA00230
-
1.724690
-
33.622000
-
440.623881
-
-
-
329
-
TA00355
-
3.498069
-
35.843897
-
451.651266
-
-
-
544
-
TA00653
-
0.265062
-
32.627203
-
487.869319
-
-
-
196
-
TA00215
-
0.052465
-
32.440690
-
505.441217
-
-
-
203
-
TA00222
-
1.186240
-
32.020330
-
577.409865
-
-
-
584
-
TA00699
-
-0.707570
-
31.402138
-
619.216128
-
-
-
558
-
TA00672
-
-6.180302
-
37.146832
-
642.321296
-
-
-
597
-
TA00712
-
-6.676308
-
39.131552
-
737.484276
-
-
-
562
-
TA00676
-
-6.780374
-
38.973512
-
742.978650
-
-
-
635
-
TA00750
-
-6.805316
-
39.139843
-
751.347364
-
-
-
636
-
TA00751
-
-6.848668
-
39.082174
-
753.892793
-
-
-
432
-
TA00494
-
-6.833860
-
39.167475
-
755.338586
-
-
-
248
-
TA00270
-
-6.842390
-
39.156760
-
755.852180
-
-
-
250
-
TA00272
-
-6.890039
-
39.117927
-
759.501414
-
-
-
431
-
TA00493
-
-6.910845
-
39.075597
-
760.236606
-
-
-
214
-
TA00233
-
3.453500
-
31.251250
-
766.277105
-
-
-
209
-
TA00228
-
3.404720
-
30.959600
-
790.422401
-
-
-
498
-
TA00601
-
-14.080148
-
33.907593
-
1557.147407
-
-
-
602
-
TA00717
-
3.898305
-
11.886437
-
2827.236339
-
-
-
590
-
TA00705
-
4.952251
-
8.341692
-
3234.191975
-
-
-
481
-
TA00577
-
10.487147
-
9.788223
-
3240.086078
-
-
-
589
-
TA00704
-
5.378602
-
6.998292
-
3388.907422
-
-
-
596
-
TA00711
-
4.906530
-
6.917064
-
3389.011984
-
-
-
410
-
TA00459
-
9.066148
-
6.569080
-
3526.820348
-
-
-
577
-
TA00692
-
6.404114
-
5.626307
-
3559.025765
-
-
-
-
-
-
-
-
-```python
-# Interactive visuals
-import plotly.express as px
-import plotly.graph_objects as go
-
-fig = px.scatter_mapbox(below_stations_metadata,
- lat="location.latitude",
- lon="location.longitude",
- hover_name="code",
- hover_data=["distance"],
- color_discrete_sequence=["fuchsia"],
- zoom=8,
- height=800,
- )
-# update marker size
-fig.update_traces(marker=dict(size=10))
-# add a point for the central station
-fig.add_trace(go.Scattermapbox(
- lat=[muringato_loc[0]],
- lon=[muringato_loc[1]],
- mode='markers',
- marker=go.scattermapbox.Marker(
- size=14
- ),
- text=['Muringato gauging station'],
- ))
-
-fig.update_layout(
- mapbox_style="carto-positron",
- margin={"r":0,"t":0,"l":0,"b":0},
- showlegend=False
-)
-fig.show()
-```
-
-
-
-
-```python
-pipe.plot_figs(
- weather_stations_data,
- list(muringato_data_s6_2022['water_level']),
- list(below.keys()),
- date=dateutil.parser.parse(str(muringato_data_s6_2022.index[0])).strftime('%d-%m-%Y'),
- save=False
-)
-```
-
- Begin plotting!
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_1.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_2.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_3.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_4.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_5.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_6.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_7.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_8.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_9.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_10.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_11.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_12.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_13.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_14.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_15.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_16.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_17.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_18.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_19.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_20.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_21.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_22.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_23.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_24.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_25.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_26.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_27.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_28.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_29.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_30.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_31.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_32.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_33.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_34.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_35.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_36.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_37.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_38.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_39.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_40.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_41.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_42.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_43.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_44.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_45.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_46.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_47.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_48.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_49.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_50.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_51.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_52.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_15_53.png)
-
-
-
-
-```python
-RADIUS = 100
-
-ewaso_weather_data_2020 = weather_stations_data.loc[ewaso_data_2020.index[0]:ewaso_data_2020.index[-1]]
-# ewaso stations within a particular radius
-ewaso_tahmo_stations_2020 = pipe.stations_within_radius(RADIUS, ewaso_loc[0], ewaso_loc[1], df=False)
-# Get stations without missing data
-# ewaso weather data
-ewaso_weather_data_2020_filtered = pipe.stations_data_check(stations_list=list(ewaso_tahmo_stations_2020),
- percentage=1, data=ewaso_weather_data_2020
- )
-# Check the sum of each column and drop columns with a sum of zero this is if the sum of water level is not equal to zero
-ewaso_weather_data_2020_filtered = ewaso_weather_data_2020_filtered.loc[:, ewaso_weather_data_2020_filtered.sum() != 0]
-```
-
- API request: services/assets/v2/stations
-
-
-
-```python
-import statsmodels.api as sm
-def calculate_lag(weather_stations_data, water_level_data, lag=3, above=None, below=None):
- above_threshold_lag = dict()
- below_threshold_lag = dict()
- for cols in weather_stations_data.columns:
- # check for positive correlation if not skip the column
- if weather_stations_data[cols].corr(water_level_data['water_level']) <= 0:
- continue
- # get the lag and the coefficient for columns with a positive correlation
- coefficient_list = list(sm.tsa.stattools.ccf(weather_stations_data[cols], water_level_data['water_level']))
- a = np.argmax(coefficient_list)
- b = coefficient_list[a]
- # print(f'{cols} has a lag of {a}')
- # print(f'{cols} has a coefficient of {b}')
- # print('-----------------------')
- if a > lag:
- above_threshold_lag[cols] = a
- elif a <= lag:
- below_threshold_lag[cols] = a
- if above:
- return above_threshold_lag
- elif below:
- return below_threshold_lag
- else:
- return above_threshold_lag, below_threshold_lag
-
-
-```
-
-Bringing all the functions together to create a pipeline
-
-
-```python
-def shed_stations(weather_stations_data, water_level_data,
- gauging_station_coords, radius, lag=3,
- percentage=1, above=None, below=None):
- # Filter the date range based on the water level data from first day of the water level data to the last day of the water level data
- weather_stations_data = weather_stations_data.loc[water_level_data.index[0]:water_level_data.index[-1]]
- # Filter the weather stations based on the radius
- lat, lon = gauging_station_coords[0], gauging_station_coords[1]
- weather_stations_data_list = pipe.stations_within_radius(radius, lat, lon, df=False)
- # get stations without missing data or the percentage of stations with missing data
- weather_stations_data_filtered = pipe.stations_data_check(stations_list=weather_stations_data_list,
- percentage=percentage,
- data=weather_stations_data)
- # Check the sum of each column and drop columns with a sum of zero this is if the sum of water level is not equal to zero
- weather_stations_data_filtered = weather_stations_data_filtered.loc[:, weather_stations_data_filtered.sum() != 0]
-
- # Filter the weather stations based on the lag and positive correlation
- above_threshold_lag, below_threshold_lag = calculate_lag(weather_stations_data_filtered, water_level_data, lag=lag)
-
- return above_threshold_lag, below_threshold_lag
-```
-
-
-```python
-above_threshold_lag, below_threshold_lag = shed_stations(weather_stations_data, ewaso_data_2020, ewaso_loc, RADIUS, lag=3, percentage=1, above=True, below=False)
-len(below_threshold_lag)
-```
-
- API request: services/assets/v2/stations
-
-
-
-
-
- 80
-
-
-
-### Plot the figures
-
-
-```python
-pipe.plot_figs(
- weather_stations_data,
- list(ewaso_data_2020['water_level']),
- list(below_threshold_lag.keys()),
- date=dateutil.parser.parse(str(ewaso_data_2020.index[0])).strftime('%d-%m-%Y'),
- save=True
-)
-```
-
- Begin plotting!
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_1.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_2.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_3.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_4.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_5.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_6.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_7.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_8.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_9.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_10.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_11.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_12.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_13.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_14.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_15.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_16.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_17.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_18.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_19.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_20.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_21.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_22.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_23.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_24.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_25.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_26.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_27.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_28.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_29.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_30.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_31.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_32.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_33.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_34.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_35.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_36.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_37.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_38.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_39.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_40.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_41.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_42.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_43.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_44.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_45.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_46.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_47.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_48.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_49.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_50.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_51.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_52.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_53.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_54.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_55.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_56.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_57.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_58.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_59.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_60.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_61.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_62.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_63.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_64.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_65.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_66.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_67.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_68.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_69.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_70.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_71.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_72.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_73.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_74.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_75.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_76.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_77.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_78.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_79.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_22_80.png)
-
-
-
-Input water level data
-Input TAHMO station data
-
-
-
-```python
-# plot the two with different colors
-fig, ax = plt.subplots(figsize=(10, 10))
-muringato_tahmo_stations.plot(kind='scatter',
- x='location.longitude',
- y='location.latitude',
- color='blue',
- alpha=0.7,
- ax=ax)
-ewaso_tahmo_stations.plot(kind='scatter',
- x='location.longitude',
- y='location.latitude',
- color='red',
- alpha=0.7,
- ax=ax)
-plt.show()
-```
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_24_0.png)
-
-
-
-Apart from the completeness another method of validation by eliminating unusable sensors is checking for a positive correlation and lag
-- The default lag is 3 days between a particular station and the gauging station
-- The required format is a timeseries data
-- Provide the column names for evaluation format = [Date, data]
-- with the change in parameters one can choose above or below threshold
-
-
-```python
-def plot_figs(weather_stations, water_list, threshold_list, save=False, dpi=500, date='11-02-2021'):
- start_date = datetime.datetime.strptime(date, "%d-%m-%Y")
- end_date = start_date + datetime.timedelta(len(water_list)-1)
- # weather_stations = weather_stations.set_index('Date')
- df_plot = weather_stations[start_date:end_date]
- df_plot = df_plot[threshold_list].reset_index()
- df_plot.rename(columns={'index':'Date'}, inplace=True)
-
-
- plt.rcParams['figure.figsize'] = (15, 9)
- print('Begin plotting!')
-
- for cols in df_plot.columns[1:]:
- fig, ax1 = plt.subplots()
- color = 'tab:blue'
- ax1.set_xlabel(f'Time', fontsize=24, weight='bold')
- ax1.set_ylabel(f'Rainfall {cols} (mm)', color=color, fontsize=24, weight='bold')
- ax1.bar(pd.to_datetime(df_plot['Date'], format="%d/%m/%Y"), df_plot[f'{cols}'], color=color, width=4, alpha=1.0)
- ax1.tick_params(axis='y', labelcolor=color, labelsize=24)
- ax1.tick_params(axis='x')
- ax1.set_xticklabels(df_plot['Date'], fontsize=21, weight='bold')
- ax1.grid(color='gray', linestyle='--', linewidth=0.8)
- ax1.set(facecolor="white")
- ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis
-
- color = 'tab:red'
- ax2.set_ylabel('Water level/Stage (m)', color=color, fontsize=24, weight='bold')
- ax2.plot(pd.to_datetime(df_plot['Date'], format="%d/%m/%Y"), water_list, color=color, linewidth=4)
- ax2.tick_params(axis='y', labelcolor=color, labelsize=24)
- ax2.set(facecolor="white")
- plt.title('Stage and Rainfall against Time', fontsize=22, weight='bold')
-
- date_form = DateFormatter("%m-%y")
- ax1.xaxis.set_major_formatter(date_form)
- fig.tight_layout()
-
- if save:
- fig.savefig(f'{cols}.png', dpi=dpi)
-
-```
-
-
-```python
-plot_figs(stations_df, lag_[list(lag_.keys())[0]]['water_list'], list(lag_.keys()), save=True, date='12-05-2020')
-```
-
- Begin plotting!
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_27_1.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_27_2.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_27_3.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_27_4.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_27_5.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_27_6.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_27_7.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_27_8.png)
-
-
-
-
-
-![png](water_level_pipeline_files/water_level_pipeline_27_9.png)
-
-
-
-Format to get the stations maetadata
-
-
-```python
-def filter_metadata(lag_keys):
- captured_list = [i.split('_')[0] for i in list(lag_keys)]
- return fs.get_stations_info(multipleStations=captured_list)
-```
-
-
-```python
-filter_metadata(list(lag_.keys()))
-```
-
- API request: services/assets/v2/stations
-
-
-
-
-
-
-
-
-
-
-
-
code
-
status
-
installationdate
-
elevationground
-
sensorinstallations
-
dataloggerinstallations
-
creatorid
-
created
-
updaterid
-
updated
-
...
-
location.countrycode
-
location.zipcode
-
location.latitude
-
location.longitude
-
location.elevationmsl
-
location.note
-
location.creatorid
-
location.created
-
location.updaterid
-
location.updated
-
-
-
-
-
26
-
TA00028
-
1
-
2015-08-31T00:00:00Z
-
9.0
-
None
-
None
-
2
-
2018-12-11T08:35:17.888233Z
-
2
-
2018-12-11T08:35:17.888233Z
-
...
-
KE
-
-
0.055219
-
37.136747
-
2003.6
-
{}
-
2
-
2018-10-26T13:32:16.15537Z
-
37
-
2022-06-30T11:11:50.27135Z
-
-
-
27
-
TA00029
-
1
-
2015-09-02T00:00:00Z
-
2.0
-
None
-
None
-
2
-
2018-12-11T08:36:19.30342Z
-
2
-
2018-12-11T08:36:19.30342Z
-
...
-
KE
-
-
-0.500776
-
36.587511
-
2545.8
-
{}
-
2
-
2018-10-26T13:33:31.451613Z
-
37
-
2022-02-28T12:25:09.578242Z
-
-
-
53
-
TA00057
-
1
-
2015-10-08T00:00:00Z
-
2.0
-
None
-
None
-
2
-
2018-12-11T09:21:29.092833Z
-
2
-
2018-12-11T09:21:29.092833Z
-
...
-
KE
-
-
-1.253030
-
36.856487
-
1645.3
-
{}
-
2
-
2018-10-29T09:13:33.768613Z
-
2
-
2022-07-26T07:34:06.603938Z
-
-
-
68
-
TA00074
-
1
-
2015-11-19T00:00:00Z
-
2.0
-
None
-
None
-
2
-
2018-12-11T09:38:25.742397Z
-
2
-
2018-12-11T09:38:25.742397Z
-
...
-
KE
-
-
-0.566080
-
37.074412
-
1726.8
-
{}
-
2
-
2018-10-29T10:35:28.49617Z
-
2
-
2022-07-26T07:38:42.100985Z
-
-
-
74
-
TA00080
-
1
-
2016-01-28T00:00:00Z
-
2.0
-
None
-
None
-
2
-
2018-12-11T09:43:10.523398Z
-
2
-
2018-12-11T09:43:10.523398Z
-
...
-
KE
-
-
-1.087589
-
36.818402
-
1777.3
-
{}
-
2
-
2018-10-29T10:53:47.845042Z
-
37
-
2022-02-28T13:07:04.709903Z
-
-
-
150
-
TA00166
-
1
-
2017-05-11T00:00:00Z
-
2.0
-
None
-
None
-
2
-
2018-12-12T08:29:28.10697Z
-
2
-
2018-12-12T08:29:28.10697Z
-
...
-
KE
-
-
-0.319508
-
37.659139
-
1404.0
-
{}
-
2
-
2018-11-10T08:47:37.949135Z
-
2
-
2018-11-10T08:47:37.949135Z
-
-
-
-
6 rows × 28 columns
-
-
-
-
-```python
+Getting Started
+---------------
+All methods require an API key and secret, which can be obtained by contacting TAHMO.
+- The ```retreive_data``` class is used to retrieve data from the TAHMO API endpoints.
+- The ```Filter``` class is used to filter weather stations data based on things like distance and region.
+- The ```pipeline``` class is used to create a pipeline of filters to apply to weather stations based on how they correlate with water level data.
+- The ```Interactive_maps``` class is used to plot weather stations on an interactive map.
-```
+For instructions on shedding weather stations based on your water level data and gauging station coordinates, please refer to the [water_level_pipeline.md](https://github.com/kaburia/filter-stations/water_level_pipeline.md) file.
\ No newline at end of file
diff --git a/filter-stations/__init__.py b/filter-stations/__init__.py
index 6b08a63..259acca 100644
--- a/filter-stations/__init__.py
+++ b/filter-stations/__init__.py
@@ -1,3 +1,36 @@
+"""
+Installation
+------------
+To install the package, run the following command in your terminal:
+```bash
+pip install -U filter-stations
+```
+Getting Started
+---------------
+All methods require an API key and secret, which can be obtained by contacting TAHMO.
+- The ```retreive_data``` class is used to retrieve data from the TAHMO API endpoints.
+- The ```Filter``` class is used to filter weather stations data based on things like distance and region.
+- The ```pipeline``` class is used to create a pipeline of filters to apply to weather stations based on how they correlate with water level data.
+- The ```Interactive_maps``` class is used to plot weather stations on an interactive map.
+
+```python
+# Import the necessary modules
+from filter_stations import retreive_data, Filter, pipeline, Interactive_maps
+
+# Define the API key and secret
+apiKey = 'your_api_key' # request from TAHMO
+apiSecret = 'your_api_secret' # request from TAHMO
+maps_key = 'your_google_maps_key' # retrieve from google maps platform
+
+# Initialize the class
+ret = retreive_data(apiKey, apiSecret, maps_key)
+fs = Filter(apiKey, apiSecret, maps_key)
+pipe = pipeline(apiKey, apiSecret, maps_key)
+maps = Interactive_maps(apiKey, apiSecret, maps_key)
+```
+
+
+"""
import requests
import urllib.parse
import pandas as pd
@@ -46,7 +79,6 @@
# Get data class
class retreive_data:
- # initialize the class
def __init__(self, apiKey, apiSecret, api_key):
self.apiKey = apiKey
self.apiSecret = apiSecret
@@ -91,6 +123,21 @@ def get_stations_info(self, station=None, multipleStations=[], countrycode=None)
-----------
- pandas.DataFrame: DataFrame containing information about the requested weather stations.
+ Usage:
+ -----------
+ To retrieve information about a single station:
+ ```python
+ station_info = ret.get_stations_info(station='TA00001')
+ ```
+ To retrieve information about multiple stations:
+ ```python
+ station_info = ret.get_stations_info(multipleStations=['TA00001', 'TA00002'])
+ ```
+ To retrieve information about all stations in a country:
+ ```python
+ station_info = ret.get_stations_info(countrycode='KE')
+ ```
+
"""
# Make API request and convert response to DataFrame
response = self.__request(endpoints['STATION_INFO'], {'sort':'code'})
@@ -148,9 +195,6 @@ def __splitDateRange(self, inputStartDate, inputEndDate):
df['end'].iloc[-1] = pd.Timestamp(endDate)
return df
- def raw_measurements(self, station, startDate=None, endDate=None, variables=None):
- return self.get_measurements(station, startDate=startDate, endDate=endDate, variables=variables, dataset='raw')
-
def k_neighbours(self, station, number=5):
"""
Returns a dictionary of the nearest neighbouring stations to the specified station.
@@ -237,29 +281,57 @@ def trained_models(self, columns=None):
return self.__handleApiError(apiRequest)
- def aggregate_variables(self, dataframe):
+ def aggregate_variables(self, dataframe, freq='1D'):
"""
Aggregates a pandas DataFrame of weather variables by summing values across each day.
Parameters:
-----------
- dataframe (pandas.DataFrame): DataFrame containing weather variable data.
+ - freq (str, optional): Frequency to aggregate the data by. Defaults to '1D'.
Returns:
-----------
- pandas.DataFrame: DataFrame containing aggregated weather variable data, summed by day.
+
+ Usage:
+ -----------
+ Define the DataFrame containing the weather variable data:
+ ```python
+ dataframe = ret.get_measurements('TA00001', '2020-01-01', '2020-01-31', ['pr']) # data comes in 5 minute interval
+ ```
+ To aggregate data hourly:
+ ```python
+ hourly_data = ret.aggregate_variables(dataframe, freq='1H')
+ ```
+ To aggregate data by 12 hours:
+ ```python
+ half_day_data = ret.aggregate_variables(dataframe, freq='12H')
+ ```
+ To aggregate data by day:
+ ```python
+ daily_data = ret.aggregate_variables(dataframe, freq='1D')
+ ```
+ To aggregate data by week:
+ ```python
+ weekly_data = ret.aggregate_variables(dataframe, freq='1W')
+ ```
+ To aggregate data by month:
+ ```python
+ monthly_data = ret.aggregate_variables(dataframe, freq='1M')
+ ```
"""
dataframe = dataframe.reset_index()
dataframe.rename(columns={'index':'Date'}, inplace=True)
# check if the column is all nan
if dataframe.iloc[:, 1].isnull().all():
return dataframe.groupby(pd.Grouper(key='Date', axis=0,
- freq='1D')).agg({f'{dataframe.columns[1]}':
+ freq=freq)).agg({f'{dataframe.columns[1]}':
lambda x: np.nan if x.isnull().all()
else x.isnull().sum()})
else:
return dataframe.groupby(pd.Grouper(key='Date', axis=0,
- freq='1D')).sum()
+ freq=freq)).sum()
# aggregate qualityflags
def aggregate_qualityflags(self, dataframe):
@@ -285,7 +357,7 @@ def aggregate_qualityflags(self, dataframe):
# Get the variables only
- def get_measurements(self, station, startDate=None, endDate=None, variables=None, dataset='controlled', aggregate=False, quality_flags=False):
+ def get_measurements(self, station, startDate=None, endDate=None, variables=None, dataset='controlled', aggregate='5min', quality_flags=False):
"""
Get measurements from a station.
@@ -303,6 +375,36 @@ def get_measurements(self, station, startDate=None, endDate=None, variables=None
-----------
- A DataFrame containing the measurements.
+ Usage:
+ -----------
+ To retrieve precipitation data for a station for the last month:
+ ```python
+ from datetime import datetime, timedelta
+
+ # Get today's date
+ today = datetime.now()
+
+ # Calculate one month ago
+ last_month = today - timedelta(days=30)
+
+ # Format date as a string
+ last_month_str = last_month.strftime('%Y-%m-%d')
+ today_str = today.strftime('%Y-%m-%d')
+
+ # Define the station you want to retrieve data from
+ station = 'TA00001'
+ variables = ['pr']
+ dataset = 'raw'
+
+ # aggregate the data to 30 minutes interval
+ aggregate = '30min'
+
+ # Call the get_measurements method to retrieve and aggregate data
+ TA00001_data = ret.get_measurements(station, last_month_str,
+ today_str, variables,
+ dataset, aggregate)
+ ```
+
"""
#print('Get measurements', station, startDate, endDate, variables)
endpoint = 'services/measurements/v2/stations/%s/measurements/%s' % (station, dataset)
@@ -422,8 +524,11 @@ def get_measurements(self, station, startDate=None, endDate=None, variables=None
# Merge all series together.
if len(series) > 0:
df = pd.concat(series, axis=1, sort=True)
+
else:
df = pd.DataFrame()
+
+
# Clean up memory.
del series
@@ -431,46 +536,80 @@ def get_measurements(self, station, startDate=None, endDate=None, variables=None
# check if dataframe is empty
if df.empty:
# add the date range in the dataframe and the column as the station filled with NaN
- df = pd.DataFrame(index=pd.date_range(start=startDate, end=endDate, tz='UTC', freq='5min'), columns=[f'{station}'])
- if quality_flags:
- if aggregate:
- return self.aggregate_qualityflags(df)
- else:
- return df
+ df = pd.DataFrame(index=pd.date_range(start=startDate, end=endDate, tz='UTC', freq=aggregate), columns=[f'{station}'])
+ # remove the last row
+ return df[:-1]
+
else:
- if aggregate:
- return self.aggregate_variables(df)
- else:
- return df
-
- # retrieve data from multiple at a time
- def retrieve_data(self, station, startDate, endDate, variables, dataset, aggregate):
- try:
- data = self.get_measurements(station, startDate, endDate, variables, dataset, aggregate)
- return data
- except Exception as e:
- return station, str(e)
-
- def multiple_measurements(self, stations_list, csv_file, startDate, endDate, variables, dataset='controlled', aggregate=True):
+ # remove the last row
+ df = df[:-1] # lacks values for the last day
+ return self.aggregate_variables(df, freq=aggregate)
+
+ def multiple_measurements(self,
+ stations_list,
+ startDate,
+ endDate,
+ variables,
+ dataset='controlled',
+ csv_file=None,
+ aggregate='1D'):
"""
- Retrieves measurements for multiple stations and saves the aggregated data to a CSV file.
+ Retrieves measurements for multiple stations within a specified date range.
Parameters:
-----------
- - stations_list (list): A list of strings containing the names of the stations to retrieve data from.
- - csv_file (str): The name of the CSV file to save the data to.
+ - stations_list (list): A list of strings containing the codes of the stations to retrieve data from.
- startDate (str): The start date for the measurements, in the format 'yyyy-mm-dd'.
- endDate (str): The end date for the measurements, in the format 'yyyy-mm-dd'.
- variables (list): A list of strings containing the names of the variables to retrieve.
- - dataset (str): The name of the dataset to retrieve the data from. Default is 'controlled'.
+ - dataset (str): The name of the database to retrieve the data from. Default is 'controlled' alternatively 'raw' database.
+ - csv_file (str, optional): pass the name of the csv file to save the data otherwise it will return the dataframe.
+        - aggregate (str): Frequency string to aggregate the data by (e.g. '1D' for daily); defaults to '1D'. Data is otherwise returned at 5-minute intervals.
Returns:
-----------
- df (pandas.DataFrame): A DataFrame containing the aggregated data for all stations.
Raises:
+ -----------
+ - ValueError: If stations_list is not a list.
+
+ ### Example Usage:
+ To retrieve precipitation data for stations in Kenya for the last week and save it as a csv file:
+ ```python
+ # Import the necessary modules
+ from datetime import datetime, timedelta
+ from filter_stations import retreive_data
+
+ # An instance of the retreive_data class
+ ret = retreive_data(apiKey, apiSecret, maps_key)
+
+ # Get today's date
+ today = datetime.now()
- ValueError: If stations_list is not a list.
+ # Calculate one week ago
+ last_week = today - timedelta(days=7)
+
+ # Format date as a string
+ last_week_str = last_week.strftime('%Y-%m-%d')
+ today_str = today.strftime('%Y-%m-%d')
+
+ # Define the list of stations you want to retrieve data from example stations in Kenya
+ stations = list(ret.get_stations_info(countrycode='KE')['code'])
+
+ # Get the precipitation data for the stations in the list
+ variables = ['pr']
+
+ # retrieve the raw data for the stations, aggregate the data and save it as a csv file
+ dataset = 'raw'
+ aggregate = '1D'
+ csv_file = 'Kenya_precipitation_data'
+
+ # Call the multiple_measurements method to retrieve and aggregate data
+ aggregated_data = ret.multiple_measurements(stations, last_week_str,
+ today_str, variables,
+ dataset, csv_file, aggregate)
+ ```
"""
if not isinstance(stations_list, list):
raise ValueError('Pass in a list')
@@ -482,7 +621,7 @@ def multiple_measurements(self, stations_list, csv_file, startDate, endDate, var
results = []
with tqdm(total=len(stations_list), desc='Retrieving data for stations') as pbar:
for station in stations_list:
- results.append(pool.apply_async(self.retrieve_data, args=(station, startDate, endDate, variables, dataset, aggregate), callback=lambda _: pbar.update(1)))
+ results.append(pool.apply_async(self.get_measurements, args=(station, startDate, endDate, variables, dataset, aggregate), callback=lambda _: pbar.update(1)))
pool.close()
pool.join()
@@ -491,8 +630,11 @@ def multiple_measurements(self, stations_list, csv_file, startDate, endDate, var
if len(df_stats) > 0:
df = pd.concat(df_stats, axis=1)
- df.to_csv(f'{csv_file}.csv')
- return df
+ if csv_file:
+ df.to_csv(f'{csv_file}.csv')
+ return df
+ else:
+ return df
except Exception as e:
print(f"An error occurred: {e}")
finally:
@@ -565,16 +707,16 @@ def anomalies_report(self, start_date, end_date=None):
Usage:
-----------
To retrieve anomaly reports for a specific date range:
- ```
+ ```python
start_date = '2023-01-01'
end_date = '2023-01-31'
- report_data = your_instance.anomalies_report(start_date, end_date)
+ report_data = ret.anomalies_report(start_date, end_date)
```
To retrieve anomaly reports for a specific date:
```
start_date = '2023-01-01'
- report_data = your_instance.anomalies_report(start_date)
+ report_data = ret.anomalies_report(start_date)
```
"""
reqUrl = "https://datahub.tahmo.org/custom/sensordx/reports" # endpoint
@@ -591,15 +733,123 @@ def anomalies_report(self, start_date, end_date=None):
anomalies_data = pd.DataFrame(apiRequest.json()['qualityObjects'])
level_2 = anomalies_data[(anomalies_data.level == 2) & (anomalies_data.type == 'sensordx')]
level_2['station_sensor'] = level_2['stationCode'] + '_' + level_2['sensorCode']
- level_2 = level_2[['startDate', 'station_sensor', 'level']]
+ level_2 = level_2[['startDate', 'station_sensor', 'description', 'level']]
level_2.startDate = pd.to_datetime([dateutil.parser.parse(i).strftime('%Y-%m-%d') for i in level_2['startDate']])
level_2.set_index('startDate', inplace=True)
- if end_date:
- return level_2.loc[start_date:end_date]
- else:
- return level_2.loc[start_date]
+ level_2 = level_2.sort_index()
+ # print(level_2)
+ try:
+ if end_date:
+ return level_2.loc[start_date:end_date]
+ else:
+ return level_2.loc[start_date]
+ except KeyError as e:
+ return e
+ else:
+ return self.__handleApiError(apiRequest)
+
+ # get the ground truth data
+ def ground_truth(self, start_date, end_date=None, level=3):
+ """
+ Retrieves ground truth data for a specified date range.
+
+ Parameters:
+ -----------
+ - start_date (str): The start date for the report in 'yyyy-mm-dd' format.
+ - end_date (str, optional): The end date for the report in 'yyyy-mm-dd' format.
+ If not provided, only data for the start_date is returned.
+
+ Returns:
+ -----------
+ - pandas.DataFrame: A DataFrame containing ground truth data with columns 'startDate',
+ 'station_sensor', 'description' and 'level'. The 'startDate' column is used as the index.
+
+ Raises:
+ -----------
+ - Exception: If there's an issue with the API request.
+
+ Usage:
+ -----------
+ To retrieve ground truth data for a specific date range:
+ ```python
+ start_date = '2023-01-01'
+ end_date = '2023-01-31'
+ report_data = ret.ground_truth(start_date, end_date)
+ ```
+
+ To retrieve ground truth data for a specific date:
+ ```
+ start_date = '2023-01-01'
+ report_data = ret.ground_truth(start_date)
+ ```
+ """
+ reqUrl = "https://datahub.tahmo.org/custom/sensordx/reports" # endpoint
+ # response = self.__request(reqUrl, {})
+ print(f'API request: {reqUrl}')
+ apiRequest = requests.get(f'{reqUrl}',
+ params={},
+ auth=requests.auth.HTTPBasicAuth(
+ self.apiKey,
+ self.apiSecret
+ )
+ )
+ if apiRequest.status_code == 200:
+ reports = pd.DataFrame(apiRequest.json()['qualityObjects'])
+ reports = reports[reports.level != 2][['startDate', 'endDate', 'stationCode', 'sensorCode', 'description', 'level']]
+ reports['station_sensor'] = reports.stationCode + '_' + reports.sensorCode
+ reports = reports.drop(['stationCode', 'sensorCode'], axis=1)
+
+ # convert the start and end date to datetime format
+ reports['startDate'] = pd.to_datetime(reports['startDate']).dt.tz_localize(None)
+ reports['endDate'] = pd.to_datetime(reports['endDate']).dt.tz_localize(None)
+ # convert start_date string to datetime format
+ start_date_dt = pd.to_datetime(start_date).tz_localize(None)
+
+ try:
+ if end_date is None:
+ # check for the date
+ def check_date(row):
+ if row.startDate <= start_date_dt and row.endDate >= start_date_dt:
+ return start_date
+ reports['Date'] = reports.apply(check_date, axis=1)
+ reports = reports.dropna()
+ reports = reports[['Date', 'station_sensor', 'description', 'level']]
+ reports.set_index('Date', inplace=True)
+ return reports
+ else:
+ # convert end_date string to datetime format
+ end_date_dt = pd.to_datetime(end_date).tz_localize(None)
+
+ # Define a function to check if a date is within a range
+ def check_date(row, date):
+ return row['startDate'] <= date and row['endDate'] >= date
+ reports_list = []
+ # Iterate over the date range
+ for single_date in pd.date_range(start_date, end_date):
+ # Filter the reports for the current date
+ filtered_reports = reports[reports.apply(check_date, axis=1, date=single_date)]
+
+ # Add the current date as a new column
+ filtered_reports['Date'] = single_date
+
+ # Append the filtered reports to the list
+ reports_list.append(filtered_reports)
+ filtered_reports_df = pd.concat(reports_list)
+
+ # Drop the startDate and endDate columns
+ filtered_reports_df = filtered_reports_df.drop(['startDate', 'endDate'], axis=1)
+ # Set the index to the Date column
+ filtered_reports_df.set_index('Date', inplace=True)
+ return filtered_reports_df
+
+ except KeyError as e:
+ return e
+
+
+
else:
return self.__handleApiError(apiRequest)
+
'''
A specific class to evaluate and validate the water level data using TAHMO Stations
To be used as it is to maintain flow
@@ -623,7 +873,9 @@ def stations_within_radius(self, radius, latitude, longitude, df=False):
- df (bool, optional): Flag indicating whether to return the result as a DataFrame. Defaults to False.
Returns:
- - DataFrame or list: DataFrame or list containing the stations within the specified radius. If df is True, a DataFrame is returned with the columns 'code', 'location.latitude', 'location.longitude', and 'distance'. If df is False, a list of station codes is returned.
+ - DataFrame or list: DataFrame or list containing the stations within the specified radius. If df is True,
+ a DataFrame is returned with the columns 'code', 'location.latitude', 'location.longitude', and 'distance'.
+ If df is False, a list of station codes is returned.
"""
stations = super().get_stations_info()
@@ -716,7 +968,7 @@ def calculate_lag(self, weather_stations_data, water_level_data, lag=3, above=No
def shed_stations(self, weather_stations_data, water_level_data,
gauging_station_coords, radius, lag=3,
- percentage=1, above=None, below=None):
+ percentage=1):
"""
Filters and processes weather station data to identify stations
potentially contributing to water level changes above or below
@@ -749,6 +1001,33 @@ def shed_stations(self, weather_stations_data, water_level_data,
positive correlations and lagged changes above the specified threshold.
- below_threshold_lag (list): List of weather stations with
positive correlations and lagged changes below the specified threshold.
+
+ Usage:
+ ------------
+ Get the TAHMO stations that correlate with the water level data
+ ```python
+ import pandas as pd
+ from filter_stations import pipeline
+
+ # An instance of the pipeline class
+ pipe = pipeline(apiKey, apiSecret, maps_key)
+
+ # load the water level data and the weather stations data
+ water_level_data = pd.read_csv('water_level_data.csv')
+ weather_stations_data = pd.read_csv('weather_stations_data.csv')
+
+ # get the coordinates of the gauging station
+ gauging_station_coords = (-0.416, 36.951)
+
+ # get the stations within a radius of 200km from the gauging station
+ radius = 200
+
+ # get the stations that correlate with the water level data
+ above_threshold_lag, below_threshold_lag = pipe.shed_stations(weather_stations_data, water_level_data,
+ gauging_station_coords, radius,
+ lag=3, percentage=1)
+ ```
+
"""
# Filter the date range based on the water level data from first day of the water level data to the last day of the water level data
weather_stations_data = weather_stations_data.loc[water_level_data.index[0]:water_level_data.index[-1]]
@@ -1280,6 +1559,13 @@ def animation_grid(self, mu_pred, xi, xj, valid_station_df, clogged_station_df,
Returns:
-----------
- HTML: The animation as an HTML object.
+
+ The animation as an MP4 file
+
+
+
"""
fig, ax = plt.subplots()
@@ -1402,6 +1688,10 @@ def get_map(self, subset_list, start_date=None, end_date=None, data_values=False
--------
- my_map : folium.folium.Map
A Folium map object showing the locations of the weather stations in the given subsets.
+
+
+
+
"""
# Read the csv file
df_rainfall = pd.read_csv(csv_file)
@@ -1454,7 +1744,7 @@ def get_map(self, subset_list, start_date=None, end_date=None, data_values=False
# From the loaded data on the jobs scored, format the data
class transform_data:
- # inherit from retrieve_data class
+
def __init__(self, apiKey, apiSecret, api_key):
super().__init__(apiKey, apiSecret, api_key)
diff --git a/setup.py b/setup.py
index d188172..293a97b 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
setup(
name='filter_stations',
- version='0.4.6',
+ version='0.5.1',
packages=find_packages(),
include_package_data=True,
description='Making it easier to navigate and clean TAHMO weather station data for ML development',
diff --git a/water_level_pipeline.md b/water_level_pipeline.md
new file mode 100644
index 0000000..f530ef6
--- /dev/null
+++ b/water_level_pipeline.md
@@ -0,0 +1,1851 @@
+
+## Water Level Pipeline
+- A series of functions to be added to the filter-stations module in PyPI to evaluate which TAHMO stations to use that corroborate with the water level
+- All begins with the coordinates of the gauging station(location of the monitoring sensor)
+
+
+```python
+import os
+from pathlib import Path
+import haversine as hs
+import pandas as pd
+import numpy as np
+import datetime
+import statsmodels.api as sm
+from matplotlib.dates import DateFormatter
+import matplotlib.pyplot as plt
+import warnings
+import dateutil.parser
+warnings.filterwarnings('ignore')
+
+# config_path
+config_path = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'config.json')
+```
+
+
+```python
+from filter_stations import retreive_data, Interactive_maps, Filter, pipeline
+import json
+# Authentication
+with open(config_path) as f:
+ conf = json.load(f)
+
+apiKey = conf['apiKey']
+apiSecret = conf['apiSecret']
+map_api_key = conf['map_api_key']
+fs = retreive_data(apiKey, apiSecret, map_api_key)
+pipe = pipeline(apiKey, apiSecret, map_api_key)
+maps = Interactive_maps(apiKey, apiSecret, map_api_key)
+```
+
+### Loading data
+Load the water level data from the github repository[Link here]
+Load the TAHMO station data from the [Link here]
+
+
+```python
+# muringato
+muringato_loc = [-0.406689, 36.96301]
+# ewaso
+ewaso_loc = [0.026833, 36.914637]
+
+# Weather stations data
+weather_stations_data = pd.read_csv(os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'stations_precipitation.csv'))
+
+''' The water level data '''
+# muringato data sensor 2 2021
+muringato_data_s2_2021 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water_data_2021', 'muringato-sensor2.csv')
+
+# muringato data sensor 2 2022
+muringato_data_s2_2022 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water_data_2021', 'muringato-sensor2-2022.csv')
+
+# muringato data sensor 6 2021
+muringato_data_s6_2021 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water_data_2021', 'muringato-sensor6.csv')
+
+# muringato data sensor 6 2022
+muringato_data_s6_2022 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water_data_2021', 'muringato-sensor6-2022.csv')
+
+
+# ewaso data sensor 2020 convert the time column to datetime
+ewaso_data_2020 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water-level-data-ewaso', '1E2020.csv')
+
+# ewaso data sensor 2022
+ewaso_data_2022 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water-level-data-ewaso', '1E2022.csv')
+
+weather_stations_data.Date = weather_stations_data.Date.astype('datetime64[ns]')
+weather_stations_data.set_index('Date', inplace=True)
+
+```
+
+To format the water level data, it needs a time column followed by a water level column; the column names can differ, but they must appear in that order
+
+
+```python
+# handle the water level data
+def format_water_level(water_level_data_path):
+ # data needs to be in the format time, data/water_level or whatever the column is called
+ water_level_data = pd.read_csv(water_level_data_path)
+ # rename the first column to time
+ water_level_data.rename(columns={water_level_data.columns[0]: 'time'}, inplace=True)
+ # convert the time column to datetime
+ water_level_data.time = pd.to_datetime([dateutil.parser.parse(i).strftime('%d-%m-%Y') for i in water_level_data['time']])
+ water_level_data.time = water_level_data.time.astype('datetime64[ns]')
+ # rename the column to water_level
+ water_level_data.rename(columns={water_level_data.columns[1]: 'water_level'}, inplace=True)
+ # set the time column as the index
+ water_level_data.set_index('time', inplace=True)
+ return water_level_data
+```
+
+
+```python
+muringato_data_s2_2021 = format_water_level(muringato_data_s2_2021)
+muringato_data_s2_2022 = format_water_level(muringato_data_s2_2022)
+muringato_data_s6_2021 = format_water_level(muringato_data_s6_2021)
+muringato_data_s6_2022 = format_water_level(muringato_data_s6_2022)
+ewaso_data_2020 = format_water_level(ewaso_data_2020)
+ewaso_data_2022 = format_water_level(ewaso_data_2022)
+
+```
+
+1. Filter the date range based on the water level data from first day of the water level data to the last day of the water level data
+2. Choose stations within a certain radius of the gauging station (for example, 100 km) and get the resulting weather data
+3. Keep only the stations with 100 percent data coverage (no missing data)
+4. Remove stations whose data is zero from beginning to end, provided the water level data has some values above zero
+5. Calculate the correlation between the water level data and the weather data; it needs to be above 0 and have a lag of at most 3 days
+6. Plot the resulting figures
+
+
+### Choosing ewaso 2020 range
+removing stations with missing data reduces from 1035 to 849 columns
+removing all zeros reduces from 849 to 604 columns
+columns with positive correlation reduces the number from 604 columns to 283 columns
+checking for lag reduces the columns to 80
+
+
+```python
+above, below = pipe.shed_stations(weather_stations_data,
+ muringato_data_s6_2022,
+ muringato_loc,
+ 100,
+ lag=3
+ )
+
+```
+
+
+```python
+below_stations = [i.split('_')[0] for i in below.keys()]
+print(below_stations)
+below_stations_metadata = fs.get_stations_info(multipleStations=below_stations)[['code', 'location.latitude', 'location.longitude']]
+```
+
+ ['TA00001', 'TA00023', 'TA00024', 'TA00025', 'TA00054', 'TA00056', 'TA00067', 'TA00077', 'TA00129', 'TA00147', 'TA00154', 'TA00155', 'TA00156', 'TA00166', 'TA00171', 'TA00189', 'TA00215', 'TA00222', 'TA00228', 'TA00230', 'TA00233', 'TA00250', 'TA00270', 'TA00270', 'TA00272', 'TA00272', 'TA00316', 'TA00317', 'TA00355', 'TA00459', 'TA00473', 'TA00480', 'TA00493', 'TA00494', 'TA00577', 'TA00601', 'TA00621', 'TA00653', 'TA00672', 'TA00676', 'TA00679', 'TA00692', 'TA00699', 'TA00704', 'TA00705', 'TA00711', 'TA00712', 'TA00712', 'TA00715', 'TA00717', 'TA00750', 'TA00751', 'TA00767']
+
+
+
+```python
+below_stations_metadata['distance']= below_stations_metadata.apply(lambda row: hs.haversine((muringato_loc[0],
+ muringato_loc[1]), (row['location.latitude'],
+ row['location.longitude'])), axis=1)
+below_stations_metadata.sort_values(by='distance')
+```
+
+
+
+
+
+
+
+
+
+
+
code
+
location.latitude
+
location.longitude
+
distance
+
+
+
+
+
52
+
TA00056
+
-0.721656
+
37.145585
+
40.480889
+
+
+
22
+
TA00024
+
-1.071731
+
37.045578
+
74.517013
+
+
+
150
+
TA00166
+
-0.319508
+
37.659139
+
78.009238
+
+
+
172
+
TA00189
+
-0.795260
+
37.665930
+
89.304790
+
+
+
230
+
TA00250
+
-0.778940
+
37.676738
+
89.504935
+
+
+
600
+
TA00715
+
-1.225618
+
36.809065
+
92.655456
+
+
+
565
+
TA00679
+
-1.270835
+
36.723916
+
99.698089
+
+
+
23
+
TA00025
+
-1.301839
+
36.760200
+
102.058383
+
+
+
422
+
TA00473
+
-0.512371
+
35.956813
+
112.495996
+
+
+
513
+
TA00621
+
-1.633020
+
37.146185
+
137.874253
+
+
+
51
+
TA00054
+
-0.239342
+
35.728897
+
138.480985
+
+
+
424
+
TA00480
+
-1.376152
+
37.797646
+
142.238019
+
+
+
61
+
TA00067
+
-1.794285
+
37.621211
+
170.765765
+
+
+
140
+
TA00156
+
-1.701123
+
38.068339
+
189.255406
+
+
+
71
+
TA00077
+
-0.383066
+
35.068406
+
210.682047
+
+
+
139
+
TA00155
+
-2.523037
+
36.829437
+
235.795373
+
+
+
21
+
TA00023
+
-2.388550
+
38.040767
+
250.831198
+
+
+
155
+
TA00171
+
-0.002710
+
34.596908
+
266.903936
+
+
+
291
+
TA00317
+
0.040440
+
34.371716
+
292.394991
+
+
+
0
+
TA00001
+
-1.123283
+
34.397992
+
296.112467
+
+
+
652
+
TA00767
+
-2.671990
+
38.369665
+
296.467402
+
+
+
290
+
TA00316
+
0.289862
+
34.371222
+
298.418648
+
+
+
131
+
TA00147
+
0.449274
+
34.282303
+
312.905564
+
+
+
117
+
TA00129
+
-3.390926
+
37.717656
+
342.264311
+
+
+
138
+
TA00154
+
-4.231107
+
37.847804
+
436.466702
+
+
+
211
+
TA00230
+
1.724690
+
33.622000
+
440.623881
+
+
+
329
+
TA00355
+
3.498069
+
35.843897
+
451.651266
+
+
+
544
+
TA00653
+
0.265062
+
32.627203
+
487.869319
+
+
+
196
+
TA00215
+
0.052465
+
32.440690
+
505.441217
+
+
+
203
+
TA00222
+
1.186240
+
32.020330
+
577.409865
+
+
+
584
+
TA00699
+
-0.707570
+
31.402138
+
619.216128
+
+
+
558
+
TA00672
+
-6.180302
+
37.146832
+
642.321296
+
+
+
597
+
TA00712
+
-6.676308
+
39.131552
+
737.484276
+
+
+
562
+
TA00676
+
-6.780374
+
38.973512
+
742.978650
+
+
+
635
+
TA00750
+
-6.805316
+
39.139843
+
751.347364
+
+
+
636
+
TA00751
+
-6.848668
+
39.082174
+
753.892793
+
+
+
432
+
TA00494
+
-6.833860
+
39.167475
+
755.338586
+
+
+
248
+
TA00270
+
-6.842390
+
39.156760
+
755.852180
+
+
+
250
+
TA00272
+
-6.890039
+
39.117927
+
759.501414
+
+
+
431
+
TA00493
+
-6.910845
+
39.075597
+
760.236606
+
+
+
214
+
TA00233
+
3.453500
+
31.251250
+
766.277105
+
+
+
209
+
TA00228
+
3.404720
+
30.959600
+
790.422401
+
+
+
498
+
TA00601
+
-14.080148
+
33.907593
+
1557.147407
+
+
+
602
+
TA00717
+
3.898305
+
11.886437
+
2827.236339
+
+
+
590
+
TA00705
+
4.952251
+
8.341692
+
3234.191975
+
+
+
481
+
TA00577
+
10.487147
+
9.788223
+
3240.086078
+
+
+
589
+
TA00704
+
5.378602
+
6.998292
+
3388.907422
+
+
+
596
+
TA00711
+
4.906530
+
6.917064
+
3389.011984
+
+
+
410
+
TA00459
+
9.066148
+
6.569080
+
3526.820348
+
+
+
577
+
TA00692
+
6.404114
+
5.626307
+
3559.025765
+
+
+
+
+
+
+
+
+```python
+# Interactive visuals
+import plotly.express as px
+import plotly.graph_objects as go
+
+fig = px.scatter_mapbox(below_stations_metadata,
+ lat="location.latitude",
+ lon="location.longitude",
+ hover_name="code",
+ hover_data=["distance"],
+ color_discrete_sequence=["fuchsia"],
+ zoom=8,
+ height=800,
+ )
+# update marker size
+fig.update_traces(marker=dict(size=10))
+# add a point for the central station
+fig.add_trace(go.Scattermapbox(
+ lat=[muringato_loc[0]],
+ lon=[muringato_loc[1]],
+ mode='markers',
+ marker=go.scattermapbox.Marker(
+ size=14
+ ),
+ text=['Muringato gauging station'],
+ ))
+
+fig.update_layout(
+ mapbox_style="carto-positron",
+ margin={"r":0,"t":0,"l":0,"b":0},
+ showlegend=False
+)
+fig.show()
+```
+
+
+
+
+```python
+pipe.plot_figs(
+ weather_stations_data,
+ list(muringato_data_s6_2022['water_level']),
+ list(below.keys()),
+ date=dateutil.parser.parse(str(muringato_data_s6_2022.index[0])).strftime('%d-%m-%Y'),
+ save=False
+)
+```
+
+ Begin plotting!
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_1.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_2.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_3.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_4.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_5.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_6.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_7.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_8.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_9.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_10.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_11.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_12.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_13.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_14.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_15.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_16.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_17.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_18.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_19.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_20.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_21.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_22.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_23.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_24.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_25.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_26.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_27.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_28.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_29.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_30.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_31.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_32.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_33.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_34.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_35.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_36.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_37.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_38.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_39.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_40.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_41.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_42.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_43.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_44.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_45.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_46.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_47.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_48.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_49.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_50.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_51.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_52.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_15_53.png)
+
+
+
+
+```python
+RADIUS = 100  # search radius around the gauging station — assumes km; TODO confirm units used by stations_within_radius
+
+# Clip the weather data to the ewaso water-level observation window (first to last timestamp)
+ewaso_weather_data_2020 = weather_stations_data.loc[ewaso_data_2020.index[0]:ewaso_data_2020.index[-1]]
+# ewaso stations within a particular radius
+ewaso_tahmo_stations_2020 = pipe.stations_within_radius(RADIUS, ewaso_loc[0], ewaso_loc[1], df=False)
+# Get stations without missing data
+# ewaso weather data (percentage=1 keeps only fully complete station columns)
+ewaso_weather_data_2020_filtered = pipe.stations_data_check(stations_list=list(ewaso_tahmo_stations_2020),
+                                                            percentage=1, data=ewaso_weather_data_2020
+                                                            )
+# Check the sum of each column and drop columns with a sum of zero this is if the sum of water level is not equal to zero
+ewaso_weather_data_2020_filtered = ewaso_weather_data_2020_filtered.loc[:, ewaso_weather_data_2020_filtered.sum() != 0]
+```
+
+ API request: services/assets/v2/stations
+
+
+
+```python
+import statsmodels.api as sm
+def calculate_lag(weather_stations_data, water_level_data, lag=3, above=None, below=None):
+    """Bucket rain stations by the lag at which their rainfall best tracks water level.
+
+    For each station column that correlates positively with
+    ``water_level_data['water_level']``, the cross-correlation function is
+    computed and the index of its maximum is taken as that station's lag.
+    Stations with lag greater than ``lag`` go into one dict, the rest into
+    the other (dict maps column name -> lag index).
+
+    Returns only the above-threshold dict if ``above`` is truthy, else only
+    the below-threshold dict if ``below`` is truthy, else the tuple
+    ``(above_threshold_lag, below_threshold_lag)``.
+    """
+    above_threshold_lag = dict()
+    below_threshold_lag = dict()
+    for cols in weather_stations_data.columns:
+        # check for positive correlation if not skip the column
+        if weather_stations_data[cols].corr(water_level_data['water_level']) <= 0:
+            continue
+        # get the lag and the coefficient for columns with a positive correlation
+        coefficient_list = list(sm.tsa.stattools.ccf(weather_stations_data[cols], water_level_data['water_level']))
+        a = np.argmax(coefficient_list)
+        # NOTE(review): b (the peak coefficient) is computed but only used by the
+        # commented-out debug prints below.
+        b = coefficient_list[a]
+        # print(f'{cols} has a lag of {a}')
+        # print(f'{cols} has a coefficient of {b}')
+        # print('-----------------------')
+        if a > lag:
+            above_threshold_lag[cols] = a
+        elif a <= lag:
+            below_threshold_lag[cols] = a
+    if above:
+        return above_threshold_lag
+    elif below:
+        return below_threshold_lag
+    else:
+        return above_threshold_lag, below_threshold_lag
+
+
+```
+
+Bringing all the functions together to create a pipeline
+
+
+```python
+def shed_stations(weather_stations_data, water_level_data,
+                  gauging_station_coords, radius, lag=3,
+                  percentage=1, above=None, below=None):
+    """Pipeline step: pick usable rain stations for one gauging station.
+
+    Clips the weather data to the water-level date range, keeps stations
+    within ``radius`` of ``gauging_station_coords`` (lat, lon) whose data
+    completeness meets ``percentage`` and whose column sum is non-zero, then
+    splits the survivors by lag via ``calculate_lag``.
+
+    NOTE(review): ``above`` and ``below`` are accepted but never forwarded to
+    ``calculate_lag``, so this function always returns the tuple
+    ``(above_threshold_lag, below_threshold_lag)`` regardless of those flags —
+    the example call site depends on this; confirm before "fixing".
+    """
+    # Filter the date range based on the water level data from first day of the water level data to the last day of the water level data
+    weather_stations_data = weather_stations_data.loc[water_level_data.index[0]:water_level_data.index[-1]]
+    # Filter the weather stations based on the radius
+    lat, lon = gauging_station_coords[0], gauging_station_coords[1]
+    weather_stations_data_list = pipe.stations_within_radius(radius, lat, lon, df=False)
+    # get stations without missing data or the percentage of stations with missing data
+    weather_stations_data_filtered = pipe.stations_data_check(stations_list=weather_stations_data_list,
+                                                              percentage=percentage,
+                                                              data=weather_stations_data)
+    # Check the sum of each column and drop columns with a sum of zero this is if the sum of water level is not equal to zero
+    weather_stations_data_filtered = weather_stations_data_filtered.loc[:, weather_stations_data_filtered.sum() != 0]
+
+    # Filter the weather stations based on the lag and positive correlation
+    above_threshold_lag, below_threshold_lag = calculate_lag(weather_stations_data_filtered, water_level_data, lag=lag)
+
+    return above_threshold_lag, below_threshold_lag
+```
+
+
+```python
+above_threshold_lag, below_threshold_lag = shed_stations(weather_stations_data, ewaso_data_2020, ewaso_loc, RADIUS, lag=3, percentage=1, above=True, below=False)
+len(below_threshold_lag)
+```
+
+ API request: services/assets/v2/stations
+
+
+
+
+
+ 80
+
+
+
+### Plot the figures
+
+
+```python
+pipe.plot_figs(
+ weather_stations_data,
+ list(ewaso_data_2020['water_level']),
+ list(below_threshold_lag.keys()),
+ date=dateutil.parser.parse(str(ewaso_data_2020.index[0])).strftime('%d-%m-%Y'),
+ save=True
+)
+```
+
+ Begin plotting!
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_1.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_2.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_3.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_4.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_5.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_6.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_7.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_8.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_9.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_10.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_11.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_12.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_13.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_14.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_15.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_16.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_17.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_18.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_19.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_20.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_21.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_22.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_23.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_24.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_25.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_26.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_27.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_28.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_29.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_30.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_31.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_32.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_33.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_34.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_35.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_36.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_37.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_38.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_39.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_40.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_41.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_42.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_43.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_44.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_45.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_46.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_47.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_48.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_49.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_50.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_51.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_52.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_53.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_54.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_55.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_56.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_57.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_58.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_59.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_60.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_61.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_62.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_63.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_64.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_65.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_66.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_67.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_68.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_69.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_70.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_71.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_72.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_73.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_74.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_75.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_76.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_77.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_78.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_79.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_22_80.png)
+
+
+
+Input water level data
+Input TAHMO station data
+
+
+
+```python
+# plot the two with different colors
+fig, ax = plt.subplots(figsize=(10, 10))
+muringato_tahmo_stations.plot(kind='scatter',
+ x='location.longitude',
+ y='location.latitude',
+ color='blue',
+ alpha=0.7,
+ ax=ax)
+ewaso_tahmo_stations.plot(kind='scatter',
+ x='location.longitude',
+ y='location.latitude',
+ color='red',
+ alpha=0.7,
+ ax=ax)
+plt.show()
+```
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_24_0.png)
+
+
+
+Apart from completeness, another way to validate and eliminate unusable sensors is to check for a positive correlation and an acceptable lag
+- The default lag is 3 days between a particular station and the gauging station
+- The required format is a timeseries data
+- Provide the column names for evaluation format = [Date, data]
+- with the change in parameters one can choose above or below threshold
+
+
+```python
+def plot_figs(weather_stations, water_list, threshold_list, save=False, dpi=500, date='11-02-2021'):
+    """Plot, per station in ``threshold_list``, rainfall bars (left axis)
+    against the water-level curve ``water_list`` (right axis).
+
+    ``date`` ('%d-%m-%Y') is the window start; the end is start plus
+    ``len(water_list) - 1`` days, so ``water_list`` must be daily values.
+    Assumes ``weather_stations`` is indexed by datetime — TODO confirm
+    (the commented-out set_index('Date') suggests callers pre-index it).
+    When ``save`` is True each figure is written to ``{column}.png``.
+    """
+    start_date = datetime.datetime.strptime(date, "%d-%m-%Y")
+    end_date = start_date + datetime.timedelta(len(water_list)-1)
+    # weather_stations = weather_stations.set_index('Date')
+    df_plot = weather_stations[start_date:end_date]
+    df_plot = df_plot[threshold_list].reset_index()
+    df_plot.rename(columns={'index':'Date'}, inplace=True)
+
+
+    plt.rcParams['figure.figsize'] = (15, 9)
+    print('Begin plotting!')
+    
+    # One figure per station column (column 0 is 'Date')
+    for cols in df_plot.columns[1:]:
+        fig, ax1 = plt.subplots()
+        color = 'tab:blue'
+        ax1.set_xlabel(f'Time', fontsize=24, weight='bold')
+        ax1.set_ylabel(f'Rainfall {cols} (mm)', color=color, fontsize=24, weight='bold')
+        ax1.bar(pd.to_datetime(df_plot['Date'], format="%d/%m/%Y"), df_plot[f'{cols}'], color=color, width=4, alpha=1.0)
+        ax1.tick_params(axis='y', labelcolor=color, labelsize=24)
+        ax1.tick_params(axis='x')
+        # NOTE(review): these labels are later superseded by the DateFormatter
+        # set below; only the font kwargs appear to have lasting effect.
+        ax1.set_xticklabels(df_plot['Date'], fontsize=21, weight='bold')
+        ax1.grid(color='gray', linestyle='--', linewidth=0.8)
+        ax1.set(facecolor="white")
+        ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis
+
+        color = 'tab:red'
+        ax2.set_ylabel('Water level/Stage (m)', color=color, fontsize=24, weight='bold')
+        ax2.plot(pd.to_datetime(df_plot['Date'], format="%d/%m/%Y"), water_list, color=color, linewidth=4)
+        ax2.tick_params(axis='y', labelcolor=color, labelsize=24)
+        ax2.set(facecolor="white")
+        plt.title('Stage and Rainfall against Time', fontsize=22, weight='bold')
+
+        # Month-year ticks on the shared x-axis
+        date_form = DateFormatter("%m-%y")
+        ax1.xaxis.set_major_formatter(date_form)
+        fig.tight_layout()
+
+        if save:
+            fig.savefig(f'{cols}.png', dpi=dpi)
+
+```
+
+
+```python
+plot_figs(stations_df, lag_[list(lag_.keys())[0]]['water_list'], list(lag_.keys()), save=True, date='12-05-2020')
+```
+
+ Begin plotting!
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_27_1.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_27_2.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_27_3.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_27_4.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_27_5.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_27_6.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_27_7.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_27_8.png)
+
+
+
+
+
+![png](water_level_pipeline_files/water_level_pipeline_27_9.png)
+
+
+
+Format to get the stations' metadata
+
+
+```python
+def filter_metadata(lag_keys):
+    """Fetch TAHMO metadata for the stations named by *lag_keys*.
+
+    Each key's station code is the part before the first underscore
+    (e.g. 'TA00001_clog flags' -> 'TA00001'); the codes are passed to
+    ``fs.get_stations_info`` and its result returned unchanged.
+    """
+    captured_list = [i.split('_')[0] for i in list(lag_keys)]
+    return fs.get_stations_info(multipleStations=captured_list)
+```
+
+
+```python
+filter_metadata(list(lag_.keys()))
+```
+
+ API request: services/assets/v2/stations
+
+
+
+
+
+