Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor to allow subclasses to set their own time format #177

Merged
merged 12 commits into from
Jan 25, 2019
3 changes: 2 additions & 1 deletion RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@

# Next Release

- [#177](https://github.com/IAMconsortium/pyam/pull/177) Modified formatting of time column on init to allow subclasses to avoid the pandas datetime range limitation (https://stackoverflow.com/a/37226672)
- [#176](https://github.com/IAMconsortium/pyam/pull/176) Corrected title setting operation in line_plot function
- [#175](https://github.com/IAMconsortium/pyam/pull/175) Update link to tutorial in readme.md
- [#175](https://github.com/IAMconsortium/pyam/pull/175) Update link to tutorial in readme.md
- [#174](https://github.com/IAMconsortium/pyam/pull/174) Add a function `difference()` to compare two IamDataFrames
- [#171](https://github.com/IAMconsortium/pyam/pull/171) Fix a bug when reading from an `ixmp.TimeSeries` object, refactor to mitigate circular dependency
- [#162](https://github.com/IAMconsortium/pyam/pull/162) Add a function to sum and append timeseries components to an aggregate variable
Expand Down
15 changes: 14 additions & 1 deletion pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
read_files,
read_pandas,
format_data,
to_int,
pattern_match,
years_match,
month_match,
Expand Down Expand Up @@ -72,7 +73,14 @@ def __init__(self, data, **kwargs):
_data = read_ix(data, **kwargs)
else:
_data = read_files(data, **kwargs)

self.data, self.time_col, self.extra_cols = _data
# cast time_col to desired format
if self.time_col == 'year':
self._format_year_col()
elif self.time_col == 'time':
self._format_datetime_col()

self._LONG_IDX = IAMC_IDX + [self.time_col] + self.extra_cols

# define a dataframe for categorization and other metadata indicators
Expand All @@ -83,6 +91,12 @@ def __init__(self, data, **kwargs):
if 'exec' in run_control():
self._execute_run_control()

def _format_year_col(self):
    """Cast the 'year' column of `self.data` to integer years.

    Invoked from `__init__` when the time column is 'year'. The column is
    first parsed with `pd.to_numeric` and then handed to `to_int`; the
    result replaces the column in place. Kept as a separate method so that
    subclasses can override how the year column is formatted.
    """
    numeric_years = pd.to_numeric(self.data['year'])
    # NOTE(review): `to_int` presumably also validates that the values are
    # whole numbers -- confirm against its definition in pyam.utils
    self.data['year'] = to_int(numeric_years)

def _format_datetime_col(self):
self.data['time'] = pd.to_datetime(self.data['time'])

def __getitem__(self, key):
_key_check = [key] if isstr(key) else key
if set(_key_check).issubset(self.meta.columns):
Expand Down Expand Up @@ -890,7 +904,6 @@ def _apply_filters(self, filters):

return keep


def col_apply(self, col, func, *args, **kwargs):
"""Apply a function to a column

Expand Down
9 changes: 4 additions & 5 deletions pyam/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import numpy as np
from pyam.logger import logger
from pyam.utils import isstr, cast_years_to_int
from pyam.utils import isstr, to_int

# %%

Expand Down Expand Up @@ -59,9 +59,8 @@ def cumulative(x, first_year, last_year):
.format(x.name or x, last_year))
return np.nan

# cast timeseries columns to `int` if necessary
if not x.index.dtype == 'int64':
cast_years_to_int(x, index=True)
# make sure we're using integers
to_int(x, index=True)

x[first_year] = fill_series(x, first_year)
x[last_year] = fill_series(x, last_year)
Expand All @@ -74,7 +73,7 @@ def cumulative(x, first_year, last_year):
if not np.isnan(x[first_year]) and not np.isnan(x[last_year]):
value = 0
for (i, yr) in enumerate(years[:-1]):
next_yr = years[i+1]
next_yr = years[i + 1]
# the summation is shifted to include the first year fully in sum,
# otherwise, would return a weighted average of `yr` and `next_yr`
value += ((next_yr - yr - 1) * x[next_yr] +
Expand Down
17 changes: 6 additions & 11 deletions pyam/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import glob
import collections
import datetime
import dateutil
import time

import numpy as np
Expand Down Expand Up @@ -170,13 +171,14 @@ def format_data(df):
year_cols, time_cols, extra_cols = [], [], []
for i in cols:
try:
year_cols.append(i) if int(i) else None
int(i) # this is a year
year_cols.append(i)
except (ValueError, TypeError):
try:
pd.to_datetime([i])
dateutil.parser.parse(str(i)) # this is datetime
time_cols.append(i)
except ValueError:
extra_cols.append(i)
extra_cols.append(i) # some other string
if year_cols and not time_cols:
time_col = 'year'
melt_cols = year_cols
Expand All @@ -189,13 +191,6 @@ def format_data(df):
df = pd.melt(df, id_vars=IAMC_IDX + extra_cols, var_name=time_col,
value_vars=sorted(melt_cols), value_name='value')

# cast time_col to correct format
if time_col == 'year':
if not df.year.dtype == 'int64':
df['year'] = cast_years_to_int(pd.to_numeric(df['year']))
if time_col == 'time':
df['time'] = pd.to_datetime(df['time'])

# cast value columns to numeric, drop NaN's, sort data
df['value'] = df['value'].astype('float64')
df.dropna(inplace=True)
Expand Down Expand Up @@ -357,7 +352,7 @@ def datetime_match(data, dts):
return data.isin(dts)


def cast_years_to_int(x, index=False):
def to_int(x, index=False):
"""Formatting series or timeseries columns to int and checking validity.
If `index=False`, the function works on the `pd.Series x`; else,
the function casts the index of `x` to int and returns x with a new index.
Expand Down
65 changes: 65 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,71 @@ def test_init_df_with_extra_col(test_pd_df):
tdf, check_like=True)


def test_init_datetime(test_pd_df):
    """Datetime column headers end up in the 'time' column on init."""
    earliest = datetime.datetime(2005, 6, 17)
    latest = datetime.datetime(2010, 6, 17)

    # swap the integer year headers of the fixture for datetime headers
    year_to_datetime = {2005: earliest, 2010: latest}
    wide = test_pd_df.copy().rename(year_to_datetime, axis="columns")

    iam_df = IamDataFrame(wide)

    assert iam_df["time"].max() == latest
    assert iam_df["time"].min() == earliest


@pytest.mark.xfail(
    reason="pandas datetime is limited to the time period of ~1677-2262, see "
           "https://stackoverflow.com/a/37226672"
)
def test_init_datetime_long_timespan(test_pd_df):
    """Init with a datetime header in the year 3005 (expected to fail)."""
    earliest = datetime.datetime(2005, 6, 17)
    latest = datetime.datetime(3005, 6, 17)

    # swap the integer year headers of the fixture for datetime headers
    year_to_datetime = {2005: earliest, 2010: latest}
    wide = test_pd_df.copy().rename(year_to_datetime, axis="columns")

    iam_df = IamDataFrame(wide)

    assert iam_df["time"].max() == latest
    assert iam_df["time"].min() == earliest


def test_init_datetime_subclass_long_timespan(test_pd_df):
    """A subclass overriding `_format_datetime_col` keeps a year-3005 date."""

    class TempSubClass(IamDataFrame):
        def _format_datetime_col(self):
            # the subclass does not try to coerce the datetimes to pandas
            # datetimes, instead simply leaving the time column as object
            # type, so we don't run into the problem of pandas limited time
            # period as discussed in https://stackoverflow.com/a/37226672
            pass

    tdf = test_pd_df.copy()
    tmin = datetime.datetime(2005, 6, 17)
    tmax = datetime.datetime(3005, 6, 17)
    # swap the integer year headers of the fixture for datetime headers
    tdf = tdf.rename({2005: tmin, 2010: tmax}, axis="columns")

    df = TempSubClass(tdf)

    assert df["time"].max() == tmax
    assert df["time"].min() == tmin


def test_to_excel(test_df):
fname = 'foo_testing.xlsx'
Expand Down
4 changes: 2 additions & 2 deletions tests/test_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
import pandas as pd
from pyam.logger import logger
from pyam import fill_series, cumulative, cross_threshold, cast_years_to_int
from pyam import fill_series, cumulative, cross_threshold, to_int
import pytest


Expand All @@ -21,7 +21,7 @@ def test_fill_series_out_of_range():

def test_cols_to_int():
y = pd.Series(data=[np.nan, 1, 3, 1], index=[2002., 2007.5, 2003., 2013.])
pytest.raises(ValueError, cast_years_to_int, x=y)
pytest.raises(ValueError, to_int, x=y)


def test_cumulative():
Expand Down