Skip to content

Commit

Permalink
Refactor to allow subclasses to set their own time format (#177)
Browse files Browse the repository at this point in the history
* Refactor to allow subclasses to set their own time format

* Add tests to show solution behaves as intended

* Update RELEASE_NOTES

* Make methods more sensible

* Make methods more sensible

* Appease stickler

* simply use to_int plus some pep8

* clean up the logic a bit of column discovery

* Make formatting methods explicit

* Clean up test names

* Appease stickler
  • Loading branch information
znicholls authored and gidden committed Jan 25, 2019
1 parent 8f1d58e commit ce914d7
Show file tree
Hide file tree
Showing 6 changed files with 93 additions and 20 deletions.
3 changes: 2 additions & 1 deletion RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@

# Next Release

- [#177](https://github.com/IAMconsortium/pyam/pull/177) Modified formatting of time column on init to allow subclasses to avoid pandas limitation (https://stackoverflow.com/a/37226672)
- [#176](https://github.com/IAMconsortium/pyam/pull/176) Corrected title setting operation in line_plot function
- [#175](https://github.com/IAMconsortium/pyam/pull/175) Update link to tutorial in readme.md
- [#174](https://github.com/IAMconsortium/pyam/pull/174) Add a function `difference()` to compare two IamDataFrames
- [#171](https://github.com/IAMconsortium/pyam/pull/171) Fix a bug when reading from an `ixmp.TimeSeries` object, refactor to mitigate circular dependency
- [#162](https://github.com/IAMconsortium/pyam/pull/162) Add a function to sum and append timeseries components to an aggregate variable
Expand Down
15 changes: 14 additions & 1 deletion pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
read_files,
read_pandas,
format_data,
to_int,
pattern_match,
years_match,
month_match,
Expand Down Expand Up @@ -72,7 +73,14 @@ def __init__(self, data, **kwargs):
_data = read_ix(data, **kwargs)
else:
_data = read_files(data, **kwargs)

self.data, self.time_col, self.extra_cols = _data
# cast time_col to desired format
if self.time_col == 'year':
self._format_year_col()
elif self.time_col == 'time':
self._format_datetime_col()

self._LONG_IDX = IAMC_IDX + [self.time_col] + self.extra_cols

# define a dataframe for categorization and other metadata indicators
Expand All @@ -83,6 +91,12 @@ def __init__(self, data, **kwargs):
if 'exec' in run_control():
self._execute_run_control()

def _format_year_col(self):
    """Coerce the 'year' column of ``self.data`` to integer values."""
    numeric_years = pd.to_numeric(self.data['year'])
    self.data['year'] = to_int(numeric_years)

def _format_datetime_col(self):
    """Coerce the 'time' column of ``self.data`` to pandas datetimes."""
    as_datetime = pd.to_datetime(self.data['time'])
    self.data['time'] = as_datetime

def __getitem__(self, key):
_key_check = [key] if isstr(key) else key
if set(_key_check).issubset(self.meta.columns):
Expand Down Expand Up @@ -890,7 +904,6 @@ def _apply_filters(self, filters):

return keep


def col_apply(self, col, func, *args, **kwargs):
"""Apply a function to a column
Expand Down
9 changes: 4 additions & 5 deletions pyam/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import numpy as np
from pyam.logger import logger
from pyam.utils import isstr, cast_years_to_int
from pyam.utils import isstr, to_int

# %%

Expand Down Expand Up @@ -59,9 +59,8 @@ def cumulative(x, first_year, last_year):
.format(x.name or x, last_year))
return np.nan

# cast tiemseries colums to `int` if necessary
if not x.index.dtype == 'int64':
cast_years_to_int(x, index=True)
# make sure we're using integers
to_int(x, index=True)

x[first_year] = fill_series(x, first_year)
x[last_year] = fill_series(x, last_year)
Expand All @@ -74,7 +73,7 @@ def cumulative(x, first_year, last_year):
if not np.isnan(x[first_year]) and not np.isnan(x[last_year]):
value = 0
for (i, yr) in enumerate(years[:-1]):
next_yr = years[i+1]
next_yr = years[i + 1]
# the summation is shifted to include the first year fully in sum,
# otherwise, would return a weighted average of `yr` and `next_yr`
value += ((next_yr - yr - 1) * x[next_yr] +
Expand Down
17 changes: 6 additions & 11 deletions pyam/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import glob
import collections
import datetime
import dateutil
import time

import numpy as np
Expand Down Expand Up @@ -170,13 +171,14 @@ def format_data(df):
year_cols, time_cols, extra_cols = [], [], []
for i in cols:
try:
year_cols.append(i) if int(i) else None
int(i) # this is a year
year_cols.append(i)
except (ValueError, TypeError):
try:
pd.to_datetime([i])
dateutil.parser.parse(str(i)) # this is datetime
time_cols.append(i)
except ValueError:
extra_cols.append(i)
extra_cols.append(i) # some other string
if year_cols and not time_cols:
time_col = 'year'
melt_cols = year_cols
Expand All @@ -189,13 +191,6 @@ def format_data(df):
df = pd.melt(df, id_vars=IAMC_IDX + extra_cols, var_name=time_col,
value_vars=sorted(melt_cols), value_name='value')

# cast time_col to correct format
if time_col == 'year':
if not df.year.dtype == 'int64':
df['year'] = cast_years_to_int(pd.to_numeric(df['year']))
if time_col == 'time':
df['time'] = pd.to_datetime(df['time'])

# cast value columns to numeric, drop NaN's, sort data
df['value'] = df['value'].astype('float64')
df.dropna(inplace=True)
Expand Down Expand Up @@ -357,7 +352,7 @@ def datetime_match(data, dts):
return data.isin(dts)


def cast_years_to_int(x, index=False):
def to_int(x, index=False):
"""Formatting series or timeseries columns to int and checking validity.
If `index=False`, the function works on the `pd.Series x`; else,
the function casts the index of `x` to int and returns x with a new index.
Expand Down
65 changes: 65 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,71 @@ def test_init_df_with_extra_col(test_pd_df):
tdf, check_like=True)


def test_init_datetime(test_pd_df):
    """Init with datetime column headers yields a 'time' column."""
    start = datetime.datetime(2005, 6, 17)
    end = datetime.datetime(2010, 6, 17)
    renamed = test_pd_df.copy().rename(
        {2005: start, 2010: end},
        axis="columns",
    )

    df = IamDataFrame(renamed)

    assert df["time"].min() == start
    assert df["time"].max() == end


@pytest.mark.xfail(reason=(
    "pandas datetime is limited to the time period of ~1677-2262, see "
    "https://stackoverflow.com/a/37226672"
))
def test_init_datetime_long_timespan(test_pd_df):
    """Datetimes beyond pandas' Timestamp range are expected to fail."""
    start = datetime.datetime(2005, 6, 17)
    end = datetime.datetime(3005, 6, 17)  # outside pandas' supported range
    renamed = test_pd_df.copy().rename(
        {2005: start, 2010: end},
        axis="columns",
    )

    df = IamDataFrame(renamed)

    assert df["time"].min() == start
    assert df["time"].max() == end


def test_init_datetime_subclass_long_timespan(test_pd_df):
    """A subclass may opt out of datetime coercion to escape pandas' limits."""

    class TempSubClass(IamDataFrame):
        def _format_datetime_col(self):
            # Leave the time column as object dtype instead of coercing to
            # pandas datetimes, sidestepping the limited Timestamp range
            # discussed in https://stackoverflow.com/a/37226672
            pass

    start = datetime.datetime(2005, 6, 17)
    end = datetime.datetime(3005, 6, 17)
    renamed = test_pd_df.copy().rename(
        {2005: start, 2010: end},
        axis="columns",
    )

    df = TempSubClass(renamed)

    assert df["time"].min() == start
    assert df["time"].max() == end


def test_to_excel(test_df):
fname = 'foo_testing.xlsx'
Expand Down
4 changes: 2 additions & 2 deletions tests/test_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
import pandas as pd
from pyam.logger import logger
from pyam import fill_series, cumulative, cross_threshold, cast_years_to_int
from pyam import fill_series, cumulative, cross_threshold, to_int
import pytest


Expand All @@ -21,7 +21,7 @@ def test_fill_series_out_of_range():

def test_cols_to_int():
y = pd.Series(data=[np.nan, 1, 3, 1], index=[2002., 2007.5, 2003., 2013.])
pytest.raises(ValueError, cast_years_to_int, x=y)
pytest.raises(ValueError, to_int, x=y)


def test_cumulative():
Expand Down

0 comments on commit ce914d7

Please sign in to comment.