diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 392a2bb1a..cf82791eb 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,6 +1,7 @@ # Next Release +- [#202](https://github.com/IAMconsortium/pyam/pull/202) Extend the `df.rename()` function with a `check_duplicates (default True)` validation option - [#199](https://github.com/IAMconsortium/pyam/pull/199) Initializing an `IamDataFrame` accepts kwargs to fill or create from the data any missing required columns - [#197](https://github.com/IAMconsortium/pyam/pull/197) Added a `normalize` function that normalizes all data in a data frame to a specific time period. - [#195](https://github.com/IAMconsortium/pyam/pull/195) Fix filtering for `time`, `day` and `hour` to use generic `pattern_match()` (if such a column exists) in 'year'-formmatted IamDataFrames diff --git a/pyam/core.py b/pyam/core.py index 634d75ccc..2bd170bda 100644 --- a/pyam/core.py +++ b/pyam/core.py @@ -531,7 +531,8 @@ def validate(self, criteria={}, exclude_on_fail=False): return df - def rename(self, mapping=None, inplace=False, append=False, **kwargs): + def rename(self, mapping=None, inplace=False, append=False, + check_duplicates=True, **kwargs): """Rename and aggregate column entries using `groupby.sum()` on values. When renaming models or scenarios, the uniqueness of the index must be maintained, and the function will raise an error otherwise. @@ -551,6 +552,10 @@ def rename(self, mapping=None, inplace=False, append=False, **kwargs): if True, do operation inplace and return None append: bool, default False if True, append renamed timeseries to IamDataFrame + check_duplicates: bool, default True + check whether conflicts between existing and renamed data exist. + If True, raise ValueError; if False, rename and merge + with `groupby().sum()`. 
""" # combine `mapping` arg and mapping kwargs, ensure no rename conflicts mapping = mapping or {} @@ -560,12 +565,16 @@ def rename(self, mapping=None, inplace=False, append=False, **kwargs): raise ValueError(msg) mapping.update(kwargs) + # determine columns that are not `model` or `scenario` + data_cols = set(self._LONG_IDX) - set(META_IDX) + # changing index and data columns can cause model-scenario mismatch if any(i in mapping for i in META_IDX)\ - and any(i in mapping for i in ['region', 'variable', 'unit']): + and any(i in mapping for i in data_cols): msg = 'Renaming index and data cols simultaneously not supported!' raise ValueError(msg) + # translate rename mapping to `filter()` arguments filters = {col: _from.keys() for col, _from in mapping.items()} # if append is True, downselect and append renamed data @@ -581,6 +590,9 @@ def rename(self, mapping=None, inplace=False, append=False, **kwargs): rows = ret._apply_filters(filters) idx = ret.meta.index.isin(_make_index(ret.data[rows])) + # if `check_duplicates`, do the rename on a copy until after the check + _data = ret.data.copy() if check_duplicates else ret.data + # apply renaming changes for col, _mapping in mapping.items(): if col in META_IDX: @@ -590,11 +602,23 @@ def rename(self, mapping=None, inplace=False, append=False, **kwargs): raise ValueError('Renaming to non-unique `{}` index!' 
.format(col)) ret.meta.index = _index.set_index(META_IDX).index - elif col not in ['region', 'variable', 'unit']: + elif col not in data_cols: raise ValueError('Renaming by `{}` not supported!'.format(col)) - ret.data.loc[rows, col] = ret.data.loc[rows, col].replace(_mapping) + _data.loc[rows, col] = _data.loc[rows, col].replace(_mapping) + + # check if duplicates exist between the renamed and not-renamed data + if check_duplicates: + merged = ( + _data.loc[rows, self._LONG_IDX].drop_duplicates().append( + _data.loc[~rows, self._LONG_IDX].drop_duplicates()) + ) + if any(merged.duplicated()): + msg = 'Duplicated rows between original and renamed data!\n{}' + conflict_rows = merged.loc[merged.duplicated(), self._LONG_IDX] + raise ValueError(msg.format(conflict_rows.drop_duplicates())) - ret.data = ret.data.groupby(ret._LONG_IDX).sum().reset_index() + # merge using `groupby().sum()` + ret.data = _data.groupby(ret._LONG_IDX).sum().reset_index() if not inplace: return ret @@ -621,8 +645,8 @@ def convert_unit(self, conversion_mapping, inplace=False): return ret def normalize(self, inplace=False, **kwargs): - """Normalize data to a given value. Currently only supports normalizing to a - specific time + """Normalize data to a given value. Currently only supports normalizing + to a specific time. 
Parameters ---------- diff --git a/tests/test_feature_append_rename_convert.py b/tests/test_feature_append_rename_convert.py index 9a7c55d2c..391e0f225 100644 --- a/tests/test_feature_append_rename_convert.py +++ b/tests/test_feature_append_rename_convert.py @@ -6,7 +6,7 @@ from numpy import testing as npt -from pyam import IamDataFrame, META_IDX, IAMC_IDX +from pyam import IamDataFrame, META_IDX, IAMC_IDX, compare RENAME_DF = IamDataFrame(pd.DataFrame([ @@ -181,6 +181,24 @@ def test_rename_append(meta_df): pd.testing.assert_frame_equal(obs.meta, exp) +def test_rename_duplicates(): + mapping = {'variable': {'test_1': 'test_3'}} + pytest.raises(ValueError, RENAME_DF.rename, **mapping) + + obs = RENAME_DF.rename(check_duplicates=False, **mapping) + + exp = IamDataFrame(pd.DataFrame([ + ['model', 'scen', 'region_a', 'test_2', 'unit', 2, 6], + ['model', 'scen', 'region_a', 'test_3', 'unit', 4, 12], + ['model', 'scen', 'region_b', 'test_3', 'unit', 4, 8], + ], columns=['model', 'scenario', 'region', + 'variable', 'unit', 2005, 2010], + )) + + assert compare(obs, exp).empty + pd.testing.assert_frame_equal(obs.data, exp.data) + + def test_convert_unit(): df = IamDataFrame(pd.DataFrame([ ['model', 'scen', 'SST', 'test_1', 'A', 1, 5],