-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_processing.py
202 lines (165 loc) · 7.34 KB
/
data_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# data_processing.py
import pandas as pd
import numpy as np
# import streamlit as st
# import awn_controller as awn # Assuming awn_controller is your custom module
from log_util import app_logger # Ensure you have this module set up for logging
logger = app_logger(__name__)
def get_human_readable_duration(recent_dateutc, history_dateutc):
    """
    Convert the gap between two epoch-millisecond timestamps into a
    human-friendly duration string.

    Parameters:
        recent_dateutc (int): The last date in UTC from the device.
        history_dateutc (int): The maximum date in UTC from the history.

    Returns:
        str: The gap rendered as whole minutes, fractional hours, or
        fractional days, whichever scale fits best.
    """
    # The inputs are epoch milliseconds, so /60000 yields minutes.
    age_minutes = (recent_dateutc - history_dateutc) / 60000
    if age_minutes >= 1440:  # a full day or more
        return f"{age_minutes / 1440:.1f} days"
    if age_minutes >= 60:  # an hour or more, under a day
        return f"{age_minutes / 60:.1f} hours"
    return f"{age_minutes:.0f} minutes"
def get_history_min_max(df, date_column="date", data_column="tempf", data_label="temp"):
    """
    Calculate the minimum and maximum values of a data column for specific time periods.

    :param df: pd.DataFrame - Input dataframe with weather data. Not modified.
    :param date_column: str - The column representing dates that are tz-aware.
    :param data_column: str - The column representing data.
    :param data_label: str - The label for the data column (kept for interface
        compatibility; currently unused).
    :return: dict - For each period label ("today", "yesterday", "last 7d",
        "last 30d", "last 90d", "last 365d"): a dict with "min" and "max" over
        that period and "current" (the value at the most recent timestamp).
    """
    # Fix: convert dates into a local Series instead of writing the converted
    # column back into df — the original mutated the caller's DataFrame as a
    # side effect.
    dates = pd.to_datetime(df[date_column])
    tz = dates.dt.tz  # None for naive data; now() then stays naive to match
    now = pd.Timestamp.now(tz=tz)
    today_start = now.normalize()  # midnight at the start of the current day
    # Each range is half-open [start, end).
    date_ranges = {
        "today": (today_start, now),
        "yesterday": (today_start - pd.Timedelta(days=1), today_start),
        "last 7d": (now - pd.Timedelta(days=7), now),
        "last 30d": (now - pd.Timedelta(days=30), now),
        "last 90d": (now - pd.Timedelta(days=90), now),
        "last 365d": (now - pd.Timedelta(days=365), now),
    }
    # "Current" value is the data at the most recent timestamp.
    current_data = df.loc[dates.idxmax(), data_column]
    results = {}
    for label, (start, end) in date_ranges.items():
        # Select rows inside the half-open window; an empty window yields
        # NaN from min()/max(), same as the original behavior.
        period_values = df.loc[(dates >= start) & (dates < end), data_column]
        results[label] = {
            "min": period_values.min(),
            "max": period_values.max(),
            "current": current_data,
        }
    return results
def bin_values(
    df, value_col, bin_col_name="value_bin", num_bins=5, max_percentile=0.75
):
    """
    Bins continuous numeric values into categories, capping the binned range
    at the `max_percentile` quantile; everything above falls into an open-ended
    overflow bin.

    :param df: pd.DataFrame - DataFrame containing the data. Gains a new
        `bin_col_name` categorical column (modified in place and returned).
    :param value_col: str - Column name for the values to bin.
    :param bin_col_name: str - Name of the column to store the binned values.
    :param num_bins: int, optional - Number of bins to create. Defaults to 5.
    :param max_percentile: float - Quantile used as the top finite edge.
    :return: pd.DataFrame, list - DataFrame with binned values in
        `bin_col_name`, and the ordered bin labels.
    :raises ValueError: If the quantile cap is not above the minimum value
        (e.g. constant data), since no bins can be formed.
    """
    min_value = df[value_col].min()
    max_value = df[value_col].quantile(max_percentile)
    # Avoid division by zero or negative steps
    if max_value <= min_value:
        raise ValueError("Max value must be greater than min value to calculate bins.")
    # Step size from the requested bin count, floored at 1 so integer bin
    # edges always advance.
    step = (max_value - min_value) / num_bins
    step = max(step, 1)
    # Finite integer edges plus +inf for the overflow bin.
    value_bins = list(range(int(min_value), int(max_value) + int(step), int(step))) + [
        float("inf")
    ]
    # Bug fix: the overflow label must start at the last FINITE edge
    # (value_bins[-2]). The original used int(max_value), which disagrees with
    # the actual bin boundary whenever the range isn't an exact multiple of
    # the step (e.g. edges ending at 84 were labeled "74+").
    value_labels = [
        f"{value_bins[i]}-{value_bins[i + 1]}" for i in range(len(value_bins) - 2)
    ] + [f"{value_bins[-2]}+"]
    # Left-closed, right-open intervals so each value lands in exactly one bin.
    df[bin_col_name] = pd.cut(
        df[value_col], bins=value_bins, labels=value_labels, right=False
    )
    return df, value_labels
def bin_directions(df, direction_col, sector_size=30, bin_col_name="direction_bin"):
    """
    Groups directional data (degrees) into equal-sized sectors.

    :param df: pd.DataFrame - DataFrame containing the directional data to
        process. Gains a new `bin_col_name` categorical column (modified in
        place and returned).
    :param direction_col: str - Column name for directional data (degrees).
    :param sector_size: int - Size of directional sectors (e.g., 30°).
    :param bin_col_name: str - Name of the column to store the binned directions.
    :return: pd.DataFrame, list - DataFrame with binned directions in
        `bin_col_name`, and sector labels for visualization.

    NOTE(review): sectors are left-closed/right-open, so a value of exactly
    360° falls outside the last sector and bins to NaN — confirm upstream
    data uses the 0–359° convention.
    """
    # Sector edges: 0, sector_size, 2*sector_size, ... up to 360.
    edges = np.arange(0, 361, sector_size)
    # Pair consecutive edges to build "lo-hi" labels.
    labels = [f"{lo}-{hi}" for lo, hi in zip(edges[:-1], edges[1:])]
    df[bin_col_name] = pd.cut(
        df[direction_col],
        bins=edges,
        labels=labels,
        right=False,
        include_lowest=True,
    )
    return df, labels
def calculate_percentages(df, group_cols):
    """
    Calculates percentage distribution within grouped categories.

    :param df: pd.DataFrame - DataFrame containing grouped data.
    :param group_cols: list - Columns to group by (e.g., 'value_bin', 'direction_bin').
    :return: pd.DataFrame - One row per group combination with "count" and
        "percentage" (count as a share of all input rows, 0-100).
    """
    # observed=False keeps empty categorical combinations as zero-count rows.
    counts = df.groupby(group_cols, observed=False).size().reset_index(name="count")
    counts["percentage"] = (counts["count"] / len(df)) * 100
    return counts
def prepare_polar_chart_data(
    df,
    value_col,
    direction_col,
    num_bins=5,
    sector_size=30,
    value_bin_col="value_bin",
    direction_bin_col="direction_bin",
    max_percentile=0.9,
):
    """
    Prepares data for a polar chart: bins the magnitude column, bins the
    direction column, then aggregates the joint distribution into percentages.

    :param df: pd.DataFrame - Input DataFrame containing the raw data.
    :param value_col: str - Column name for the continuous values (e.g., wind speed).
    :param direction_col: str - Column name for the directional data (e.g., wind direction).
    :param num_bins: int - Number of bins for the value column. Defaults to 5.
    :param sector_size: int - Size of directional sectors (degrees). Defaults to 30.
    :param value_bin_col: str - Column name for the binned values. Defaults to "value_bin".
    :param direction_bin_col: str - Column name for the binned directions. Defaults to "direction_bin".
    :param max_percentile: float - Percentile to set the maximum binning value. Defaults to 0.9 (90th percentile).
    :return: pd.DataFrame, list, list - Grouped data for the polar chart,
        the value-bin labels, and the direction-sector labels.
    """
    # Magnitudes first, capped at max_percentile via bin_values.
    df, value_labels = bin_values(
        df,
        value_col,
        num_bins=num_bins,
        bin_col_name=value_bin_col,
        max_percentile=max_percentile,
    )
    # Then directions into fixed-width sectors.
    df, direction_labels = bin_directions(
        df, direction_col, sector_size, bin_col_name=direction_bin_col
    )
    # Joint (value bin × direction bin) distribution as percentages.
    grouped_data = calculate_percentages(df, [value_bin_col, direction_bin_col])
    return grouped_data, value_labels, direction_labels