IAMconsortium · gidden · Jan 25, 2019 · Jan 24, 2019 · Jan 24, 2019 · Jan 25, 2019
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
@@ -1,8 +1,9 @@
 
 # Next Release
 
+- [#177](https://github.com/IAMconsortium/pyam/pull/177) Modified formatting of time column on init to allow subclasses to avoid the pandas datetime range limitation (https://stackoverflow.com/a/37226672)
 - [#176](https://github.com/IAMconsortium/pyam/pull/176) Corrected title setting operation in line_plot function
-- [#175](https://github.com/IAMconsortium/pyam/pull/175) Update link to tutorial in readme.md 
+- [#175](https://github.com/IAMconsortium/pyam/pull/175) Update link to tutorial in readme.md
 - [#174](https://github.com/IAMconsortium/pyam/pull/174) Add a function `difference()` to compare two IamDataFrames
 - [#171](https://github.com/IAMconsortium/pyam/pull/171) Fix a bug when reading from an `ixmp.TimeSeries` object, refactor to mitigate circular dependency
 - [#162](https://github.com/IAMconsortium/pyam/pull/162) Add a function to sum and append timeseries components to an aggregate variable

diff --git a/pyam/core.py b/pyam/core.py
@@ -23,6 +23,7 @@
     read_files,
     read_pandas,
     format_data,
+    to_int,
     pattern_match,
     years_match,
     month_match,
@@ -72,7 +73,14 @@ def __init__(self, data, **kwargs):
             _data = read_ix(data, **kwargs)
         else:
             _data = read_files(data, **kwargs)
+
         self.data, self.time_col, self.extra_cols = _data
+        # cast time_col to desired format
+        if self.time_col == 'year':
+            self._format_year_col()
+        elif self.time_col == 'time':
+            self._format_datetime_col()
+
         self._LONG_IDX = IAMC_IDX + [self.time_col] + self.extra_cols
 
         # define a dataframe for categorization and other metadata indicators
@@ -83,6 +91,12 @@ def __init__(self, data, **kwargs):
         if 'exec' in run_control():
             self._execute_run_control()
 
+    def _format_year_col(self):  # overridable hook called from __init__
+        self.data['year'] = to_int(pd.to_numeric(self.data['year']))
+
+    def _format_datetime_col(self):  # overridable hook called from __init__
+        self.data['time'] = pd.to_datetime(self.data['time'])
+
     def __getitem__(self, key):
         _key_check = [key] if isstr(key) else key
         if set(_key_check).issubset(self.meta.columns):
@@ -890,7 +904,6 @@ def _apply_filters(self, filters):
 
         return keep
 
-
     def col_apply(self, col, func, *args, **kwargs):
         """Apply a function to a column
 

diff --git a/pyam/timeseries.py b/pyam/timeseries.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 from pyam.logger import logger
-from pyam.utils import isstr, cast_years_to_int
+from pyam.utils import isstr, to_int
 
 # %%
 
@@ -59,9 +59,8 @@ def cumulative(x, first_year, last_year):
                          .format(x.name or x, last_year))
         return np.nan
 
-    # cast tiemseries colums to `int` if necessary
-    if not x.index.dtype == 'int64':
-        cast_years_to_int(x, index=True)
+    # make sure the index of `x` (the years) is cast to integers
+    to_int(x, index=True)
 
     x[first_year] = fill_series(x, first_year)
     x[last_year] = fill_series(x, last_year)
@@ -74,7 +73,7 @@ def cumulative(x, first_year, last_year):
     if not np.isnan(x[first_year]) and not np.isnan(x[last_year]):
         value = 0
         for (i, yr) in enumerate(years[:-1]):
-            next_yr = years[i+1]
+            next_yr = years[i + 1]
             # the summation is shifted to include the first year fully in sum,
             # otherwise, would return a weighted average of `yr` and `next_yr`
             value += ((next_yr - yr - 1) * x[next_yr] +

diff --git a/pyam/utils.py b/pyam/utils.py
@@ -6,6 +6,7 @@
 import glob
 import collections
 import datetime
+import dateutil.parser
 import time
 
 import numpy as np
@@ -170,13 +171,14 @@ def format_data(df):
         year_cols, time_cols, extra_cols = [], [], []
         for i in cols:
             try:
-                year_cols.append(i) if int(i) else None
+                int(i)  # this is a year
+                year_cols.append(i)
             except (ValueError, TypeError):
                 try:
-                    pd.to_datetime([i])
+                    dateutil.parser.parse(str(i))  # this is datetime
                     time_cols.append(i)
                 except ValueError:
-                    extra_cols.append(i)
+                    extra_cols.append(i)  # some other string
         if year_cols and not time_cols:
             time_col = 'year'
             melt_cols = year_cols
@@ -189,13 +191,6 @@ def format_data(df):
         df = pd.melt(df, id_vars=IAMC_IDX + extra_cols, var_name=time_col,
                      value_vars=sorted(melt_cols), value_name='value')
 
-    # cast time_col to correct format
-    if time_col == 'year':
-        if not df.year.dtype == 'int64':
-            df['year'] = cast_years_to_int(pd.to_numeric(df['year']))
-    if time_col == 'time':
-        df['time'] = pd.to_datetime(df['time'])
-
     # cast value columns to numeric, drop NaN's, sort data
     df['value'] = df['value'].astype('float64')
     df.dropna(inplace=True)
@@ -357,7 +352,7 @@ def datetime_match(data, dts):
     return data.isin(dts)
 
 
-def cast_years_to_int(x, index=False):
+def to_int(x, index=False):
     """Formatting series or timeseries columns to int and checking validity.
     If `index=False`, the function works on the `pd.Series x`; else,
     the function casts the index of `x` to int and returns x with a new index.

diff --git a/tests/test_core.py b/tests/test_core.py
@@ -65,6 +65,71 @@ def test_init_df_with_extra_col(test_pd_df):
                                   tdf, check_like=True)
 
 
+def test_init_datetime(test_pd_df):
+    tdf = test_pd_df.copy()
+    tmin = datetime.datetime(2005, 6, 17)
+    tmax = datetime.datetime(2010, 6, 17)
+    tdf = tdf.rename(  # swap the 2005/2010 column labels for datetimes
+        {
+            2005: tmin,
+            2010: tmax,
+        },
+        axis="columns"
+    )
+
+    df = IamDataFrame(tdf)
+
+    assert df["time"].max() == tmax  # values must survive init unchanged
+    assert df["time"].min() == tmin
+
+
+@pytest.mark.xfail(reason=(
+    "pandas datetime is limited to the time period of ~1677-2262, see "
+    "https://stackoverflow.com/a/37226672"
+))
+def test_init_datetime_long_timespan(test_pd_df):
+    tdf = test_pd_df.copy()
+    tmin = datetime.datetime(2005, 6, 17)
+    tmax = datetime.datetime(3005, 6, 17)  # beyond the pandas datetime range
+    tdf = tdf.rename(  # swap the 2005/2010 column labels for datetimes
+        {
+            2005: tmin,
+            2010: tmax,
+        },
+        axis="columns"
+    )
+
+    df = IamDataFrame(tdf)
+
+    assert df["time"].max() == tmax
+    assert df["time"].min() == tmin
+
+
+def test_init_datetime_subclass_long_timespan(test_pd_df):
+    class TempSubClass(IamDataFrame):
+        def _format_datetime_col(self):
+            # Skip the coercion to pandas datetimes and leave the time column
+            # as object type; this avoids the limited time period that pandas
+            # datetimes can represent, as discussed in
+            # https://stackoverflow.com/a/37226672
+            pass
+
+    tdf = test_pd_df.copy()
+    tmin = datetime.datetime(2005, 6, 17)
+    tmax = datetime.datetime(3005, 6, 17)  # beyond the pandas datetime range
+    tdf = tdf.rename(  # swap the 2005/2010 column labels for datetimes
+        {
+            2005: tmin,
+            2010: tmax,
+        },
+        axis="columns"
+    )
+
+    df = TempSubClass(tdf)
+
+    assert df["time"].max() == tmax
+    assert df["time"].min() == tmin
+
 
 def test_to_excel(test_df):
     fname = 'foo_testing.xlsx'

diff --git a/tests/test_timeseries.py b/tests/test_timeseries.py
@@ -4,7 +4,7 @@
 import numpy as np
 import pandas as pd
 from pyam.logger import logger
-from pyam import fill_series, cumulative, cross_threshold, cast_years_to_int
+from pyam import fill_series, cumulative, cross_threshold, to_int
 import pytest
 
 
@@ -21,7 +21,7 @@ def test_fill_series_out_of_range():
 
 def test_cols_to_int():
     y = pd.Series(data=[np.nan, 1, 3, 1], index=[2002., 2007.5, 2003., 2013.])
-    pytest.raises(ValueError, cast_years_to_int, x=y)
+    pytest.raises(ValueError, to_int, x=y)
 
 
 def test_cumulative():