From 10cf203f4b52a25e0fc4b1d4a62f13138d3b6046 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Fri, 13 Dec 2019 11:15:08 +0100 Subject: [PATCH 01/26] use full `TEST_DF` for unit tests --- tests/conftest.py | 3 +-- tests/test_core.py | 5 +++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index b70dd29bb..03c80ae6f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -189,8 +189,7 @@ ] ) def test_df(request): - tdf = TEST_DF.iloc[:2] - tdf = tdf.rename({2005: request.param[0], 2010: request.param[1]}, + tdf = TEST_DF.rename({2005: request.param[0], 2010: request.param[1]}, axis="columns") df = IamDataFrame(data=tdf) yield df diff --git a/tests/test_core.py b/tests/test_core.py index edd723ad1..73bbd6b8e 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -167,7 +167,7 @@ def test_model(test_df): def test_scenario(test_df): - exp = pd.Series(data=['scen_a'], name='scenario') + exp = pd.Series(data=['scen_a', 'scen_b'], name='scenario') pd.testing.assert_series_equal(test_df.scenarios(), exp) @@ -478,7 +478,8 @@ def test_timeseries(test_df): 'years': [2005, 2010], 'value': [1, 6]} exp = pd.DataFrame(dct).pivot_table(index=['model', 'scenario'], columns=['years'], values='value') - obs = test_df.filter(variable='Primary Energy').timeseries() + obs = test_df.filter(scenario='scen_a', + variable='Primary Energy').timeseries() npt.assert_array_equal(obs, exp) From 85afde44255ebbb0e43f80eaf0275a96fbecbba3 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Fri, 13 Dec 2019 11:52:47 +0100 Subject: [PATCH 02/26] use full `TEST_DF` for unit tests specific for 'year' feature --- tests/conftest.py | 7 +++++-- tests/test_core.py | 14 +++++--------- tests/test_io.py | 8 ++++++-- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 03c80ae6f..77a16da43 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -179,6 +179,7 @@ TEST_DTS = [datetime(2005, 6, 17), datetime(2010, 7, 21)] +# IamDataFrame with four different time formats @pytest.fixture( scope="function", params=[ @@ -190,17 +191,19 @@ ) def test_df(request): tdf = TEST_DF.rename({2005: request.param[0], 2010: request.param[1]}, - axis="columns") + axis="columns") df = IamDataFrame(data=tdf) yield df +# IamDataFrame for testing specifically for 'year'-column feature @pytest.fixture(scope="function") def test_df_year(): - df = IamDataFrame(data=TEST_DF.iloc[:2]) + df = IamDataFrame(data=TEST_DF) yield df +# standard test data as pandas.DataFrame (only 'year' time format) @pytest.fixture(scope="function") def test_pd_df(): yield TEST_DF.copy() diff --git a/tests/test_core.py b/tests/test_core.py index 73bbd6b8e..d9f3c0a7a 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -634,17 +634,13 @@ def test_category_top_level(meta_df): def test_interpolate(test_df_year): test_df_year.interpolate(2007) - dct = {'model': ['a_model'] * 3, 'scenario': ['a_scenario'] * 3, - 'years': [2005, 2007, 2010], 'value': [1, 3, 6]} - exp = pd.DataFrame(dct).pivot_table(index=['model', 'scenario'], - columns=['years'], values='value') - variable = {'variable': 'Primary Energy'} - obs = test_df_year.filter(**variable).timeseries() - npt.assert_array_equal(obs, exp) + obs = test_df_year.filter(year=2007).data['value'].reset_index(drop=True) + exp = pd.Series([3, 1.5, 4], name='value') + pd.testing.assert_series_equal(obs, exp) - # redo the inpolation and check that no duplicates are added + # redo the interpolation and check that no 
duplicates are added test_df_year.interpolate(2007) - assert not test_df_year.filter(**variable).data.duplicated().any() + assert not test_df_year.filter().data.duplicated().any() def test_filter_by_bool(meta_df): diff --git a/tests/test_io.py b/tests/test_io.py index 2fd585350..3c4eeaed2 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -7,6 +7,8 @@ from conftest import TEST_DATA_DIR +FILTER_ARGS = dict(scenario='scen_a') + def test_io_csv(meta_df): # write to csv @@ -59,14 +61,16 @@ def test_load_meta(meta_df, args): def test_load_ssp_database_downloaded_file(test_df_year): + exp = test_df_year.filter(**FILTER_ARGS).as_pandas() obs_df = IamDataFrame(os.path.join( TEST_DATA_DIR, 'test_SSP_database_raw_download.xlsx') ) - pd.testing.assert_frame_equal(obs_df.as_pandas(), test_df_year.as_pandas()) + pd.testing.assert_frame_equal(obs_df.as_pandas(), exp) def test_load_rcp_database_downloaded_file(test_df_year): + exp = test_df_year.filter(**FILTER_ARGS).as_pandas() obs_df = IamDataFrame(os.path.join( TEST_DATA_DIR, 'test_RCP_database_raw_download.xlsx') ) - pd.testing.assert_frame_equal(obs_df.as_pandas(), test_df_year.as_pandas()) + pd.testing.assert_frame_equal(obs_df.as_pandas(), exp) From 08b894acab1b68588d21b63ff00b6925c3e1941e Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Fri, 13 Dec 2019 16:52:14 +0100 Subject: [PATCH 03/26] replace `meta_df` by `test_df` across all tests --- tests/conftest.py | 20 +-- tests/test_cast_to_iamc.py | 59 +++---- tests/test_core.py | 172 ++++++++++---------- tests/test_feature_aggregate.py | 24 +-- tests/test_feature_append_rename_convert.py | 58 +++---- tests/test_feature_compare.py | 16 +- tests/test_feature_set_meta.py | 92 +++++------ tests/test_io.py | 22 +-- 8 files changed, 224 insertions(+), 239 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 77a16da43..42e61c0a3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -177,6 +177,8 @@ TEST_YEARS = [2005, 2010] TEST_DTS = [datetime(2005, 6, 17), datetime(2010, 7, 21)] +TEST_TIME_STR = ['2005-06-17', '2010-07-21'] +TEST_TIME_STR_HR = ['2005-06-17 00:00:00', '2010-07-21 12:00:00'] # IamDataFrame with four different time formats @@ -185,8 +187,8 @@ params=[ TEST_YEARS, TEST_DTS, - ['2005-06-17', '2010-07-21'], - ['2005-06-17 00:00:00', '2010-07-21 12:00:00'] + TEST_TIME_STR, + TEST_TIME_STR_HR ] ) def test_df(request): @@ -209,20 +211,6 @@ def test_pd_df(): yield TEST_DF.copy() -@pytest.fixture( - scope="function", - params=[ - TEST_YEARS, - TEST_DTS, - ] -) -def meta_df(request): - mdf = TEST_DF.rename({2005: request.param[0], 2010: request.param[1]}, - axis="columns") - df = IamDataFrame(data=mdf) - yield df - - @pytest.fixture(scope="function") def check_aggregate_df(): df = IamDataFrame(data=CHECK_AGG_DF) diff --git a/tests/test_cast_to_iamc.py b/tests/test_cast_to_iamc.py index 3e9e7602f..522b0d827 100644 --- a/tests/test_cast_to_iamc.py +++ b/tests/test_cast_to_iamc.py @@ -4,43 +4,39 @@ from conftest import TEST_DTS -def test_cast_from_value_col(meta_df): +def test_cast_from_value_col(test_df_year): df_with_value_cols = pd.DataFrame([ - ['model_a', 'scen_a', 'World', 'EJ/y', TEST_DTS[0], 1, 0.5], - ['model_a', 'scen_a', 'World', 'EJ/y', TEST_DTS[1], 6., 3], - ['model_a', 'scen_b', 'World', 'EJ/y', TEST_DTS[0], 2, None], - ['model_a', 'scen_b', 'World', 'EJ/y', TEST_DTS[1], 7, None] + ['model_a', 'scen_a', 'World', 'EJ/y', 2005, 1, 0.5], + ['model_a', 'scen_a', 'World', 'EJ/y', 2010, 6., 3], + ['model_a', 'scen_b', 'World', 'EJ/y', 2005, 2, None], + ['model_a', 
'scen_b', 'World', 'EJ/y', 2010, 7, None] ], - columns=['model', 'scenario', 'region', 'unit', 'time', + columns=['model', 'scenario', 'region', 'unit', 'year', 'Primary Energy', 'Primary Energy|Coal'], ) df = IamDataFrame(df_with_value_cols, value=['Primary Energy', 'Primary Energy|Coal']) - if "year" in meta_df.data.columns: - df = df.swap_time_for_year() - assert compare(meta_df, df).empty - pd.testing.assert_frame_equal(df.data, meta_df.data, check_like=True) + assert compare(test_df_year, df).empty + pd.testing.assert_frame_equal(df.data, test_df_year.data, check_like=True) -def test_cast_from_value_col_and_args(meta_df): +def test_cast_from_value_col_and_args(test_df_year): # checks for issue [#210](https://github.com/IAMconsortium/pyam/issues/210) df_with_value_cols = pd.DataFrame([ - ['scen_a', 'World', 'EJ/y', TEST_DTS[0], 1, 0.5], - ['scen_a', 'World', 'EJ/y', TEST_DTS[1], 6., 3], - ['scen_b', 'World', 'EJ/y', TEST_DTS[0], 2, None], - ['scen_b', 'World', 'EJ/y', TEST_DTS[1], 7, None] + ['scen_a', 'World', 'EJ/y', 2005, 1, 0.5], + ['scen_a', 'World', 'EJ/y', 2010, 6., 3], + ['scen_b', 'World', 'EJ/y', 2005, 2, None], + ['scen_b', 'World', 'EJ/y', 2010, 7, None] ], - columns=['scenario', 'iso', 'unit', 'time', + columns=['scenario', 'iso', 'unit', 'year', 'Primary Energy', 'Primary Energy|Coal'], ) df = IamDataFrame(df_with_value_cols, model='model_a', region='iso', value=['Primary Energy', 'Primary Energy|Coal']) - if "year" in meta_df.data.columns: - df = df.swap_time_for_year() - assert compare(meta_df, df).empty - pd.testing.assert_frame_equal(df.data, meta_df.data, check_like=True) + assert compare(test_df_year, df).empty + pd.testing.assert_frame_equal(df.data, test_df_year.data, check_like=True) def test_cast_with_model_arg_raises(): @@ -53,35 +49,32 @@ def test_cast_with_model_arg_raises(): pytest.raises(ValueError, IamDataFrame, df, model='foo') -def test_cast_with_model_arg(meta_df): - df = meta_df.timeseries().reset_index() +def test_cast_with_model_arg(test_df): + df = test_df.timeseries().reset_index() df.rename(columns={'model': 'foo'}, inplace=True) df = IamDataFrame(df, model='foo') - assert compare(meta_df, df).empty - pd.testing.assert_frame_equal(df.data, meta_df.data) + assert compare(test_df, df).empty + pd.testing.assert_frame_equal(df.data, test_df.data) -def test_cast_by_column_concat(meta_df): - dts = TEST_DTS +def test_cast_by_column_concat(test_df_year): df = pd.DataFrame([ ['scen_a', 'World', 'Primary Energy', None, 'EJ/y', 1, 6.], ['scen_a', 'World', 'Primary Energy', 'Coal', 'EJ/y', 0.5, 3], ['scen_b', 'World', 'Primary Energy', None, 'EJ/y', 2, 7], ], - columns=['scenario', 'region', 'var_1', 'var_2', 'unit'] + dts, + columns=['scenario', 'region', 'var_1', 'var_2', 'unit', 2005, 2010], ) df = IamDataFrame(df, model='model_a', variable=['var_1', 'var_2']) - if "year" in meta_df.data.columns: - df = df.swap_time_for_year() - assert compare(meta_df, df).empty - pd.testing.assert_frame_equal(df.data, meta_df.data, check_like=True) + assert compare(test_df_year, df).empty + pd.testing.assert_frame_equal(df.data, test_df_year.data, check_like=True) -def test_cast_with_variable_and_value(meta_df): - pe_df = meta_df.filter(variable='Primary Energy') +def test_cast_with_variable_and_value(test_df): + pe_df = test_df.filter(variable='Primary Energy') df = pe_df.data.rename(columns={'value': 'lvl'}).drop('variable', axis=1) df = IamDataFrame(df, variable='Primary Energy', value='lvl') diff --git a/tests/test_core.py b/tests/test_core.py index 
d9f3c0a7a..5afe7ec48 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -457,19 +457,19 @@ def test_filter_year_with_time_col(test_pd_df): pd.testing.assert_frame_equal(obs, exp[0:2]) -def test_filter_as_kwarg(meta_df): - obs = list(meta_df.filter(variable='Primary Energy|Coal').scenarios()) +def test_filter_as_kwarg(test_df): + obs = list(test_df.filter(variable='Primary Energy|Coal').scenarios()) assert obs == ['scen_a'] -def test_filter_keep_false(meta_df): - df = meta_df.filter(variable='Primary Energy|Coal', year=2005, keep=False) +def test_filter_keep_false(test_df): + df = test_df.filter(variable='Primary Energy|Coal', year=2005, keep=False) obs = df.data[df.data.scenario == 'scen_a'].value npt.assert_array_equal(obs, [1, 6, 3]) -def test_filter_by_regexp(meta_df): - obs = meta_df.filter(scenario='sce._a$', regexp=True) +def test_filter_by_regexp(test_df): + obs = test_df.filter(scenario='sce._a$', regexp=True) assert obs['scenario'].unique() == 'scen_a' @@ -488,147 +488,149 @@ def test_timeseries_raises(test_df_year): pytest.raises(ValueError, _df.timeseries) -def test_filter_meta_index(meta_df): - obs = meta_df.filter(scenario='scen_b').meta.index +def test_filter_meta_index(test_df): + obs = test_df.filter(scenario='scen_b').meta.index exp = pd.MultiIndex(levels=[['model_a'], ['scen_b']], labels=[[0], [0]], names=['model', 'scenario']) pd.testing.assert_index_equal(obs, exp) -def test_meta_idx(meta_df): +def test_meta_idx(test_df): # assert that the `drop_duplicates()` in `_meta_idx()` returns right length - assert len(_meta_idx(meta_df.data)) == 2 + assert len(_meta_idx(test_df.data)) == 2 -def test_require_variable(meta_df): - obs = meta_df.require_variable(variable='Primary Energy|Coal', +def test_require_variable(test_df): + obs = test_df.require_variable(variable='Primary Energy|Coal', exclude_on_fail=True) assert len(obs) == 1 assert obs.loc[0, 'scenario'] == 'scen_b' - assert list(meta_df['exclude']) == [False, True] + assert list(test_df['exclude']) == [False, True] -def test_require_variable_top_level(meta_df): - obs = require_variable(meta_df, variable='Primary Energy|Coal', +def test_require_variable_top_level(test_df): + obs = require_variable(test_df, variable='Primary Energy|Coal', exclude_on_fail=True) assert len(obs) == 1 assert obs.loc[0, 'scenario'] == 'scen_b' - assert list(meta_df['exclude']) == [False, True] + assert list(test_df['exclude']) == [False, True] -def test_validate_all_pass(meta_df): - obs = meta_df.validate( +def test_validate_all_pass(test_df): + obs = test_df.validate( {'Primary Energy': {'up': 10}}, exclude_on_fail=True) assert obs is None - assert len(meta_df.data) == 6 # data unchanged + assert len(test_df.data) == 6 # data unchanged - assert list(meta_df['exclude']) == [False, False] # none excluded + assert list(test_df['exclude']) == [False, False] # none excluded -def test_validate_nonexisting(meta_df): - obs = meta_df.validate({'Primary Energy|Coal': {'up': 2}}, +def test_validate_nonexisting(test_df): + obs = test_df.validate({'Primary Energy|Coal': {'up': 2}}, exclude_on_fail=True) assert len(obs) == 1 assert obs['scenario'].values[0] == 'scen_a' - assert list(meta_df['exclude']) == [True, False] # scenario with failed + assert list(test_df['exclude']) == [True, False] # scenario with failed # validation excluded, scenario with non-defined value passes validation -def test_validate_up(meta_df): - obs = meta_df.validate({'Primary Energy': {'up': 6.5}}, +def test_validate_up(test_df): + obs = test_df.validate({'Primary Energy': 
{'up': 6.5}}, exclude_on_fail=False) assert len(obs) == 1 - if 'year' in meta_df.data: + if 'year' in test_df.data: assert obs['year'].values[0] == 2010 else: exp_time = pd.to_datetime(datetime.datetime(2010, 7, 21)) - assert pd.to_datetime(obs['time'].values[0]) == exp_time + print(exp_time) + assert pd.to_datetime(obs['time'].values[0]).date() == exp_time - assert list(meta_df['exclude']) == [False, False] # assert none excluded + assert list(test_df['exclude']) == [False, False] # assert none excluded -def test_validate_lo(meta_df): - obs = meta_df.validate({'Primary Energy': {'up': 8, 'lo': 2.0}}) +def test_validate_lo(test_df): + obs = test_df.validate({'Primary Energy': {'up': 8, 'lo': 2.0}}) assert len(obs) == 1 - if 'year' in meta_df.data: + if 'year' in test_df.data: assert obs['year'].values[0] == 2005 else: exp_year = pd.to_datetime(datetime.datetime(2005, 6, 17)) - assert pd.to_datetime(obs['time'].values[0]) == exp_year + assert pd.to_datetime(obs['time'].values[0]).date() == exp_year assert list(obs['scenario'].values) == ['scen_a'] -def test_validate_both(meta_df): - obs = meta_df.validate({'Primary Energy': {'up': 6.5, 'lo': 2.0}}) +def test_validate_both(test_df): + obs = test_df.validate({'Primary Energy': {'up': 6.5, 'lo': 2.0}}) assert len(obs) == 2 - if 'year' in meta_df.data: + if 'year' in test_df.data: assert list(obs['year'].values) == [2005, 2010] else: exp_time = pd.to_datetime(TEST_DTS) + obs.time = obs.time.dt.normalize() assert (pd.to_datetime(obs['time'].values) == exp_time).all() assert list(obs['scenario'].values) == ['scen_a', 'scen_b'] -def test_validate_year(meta_df): - obs = meta_df.validate({'Primary Energy': {'up': 5.0, 'year': 2005}}, +def test_validate_year(test_df): + obs = test_df.validate({'Primary Energy': {'up': 5.0, 'year': 2005}}, exclude_on_fail=False) assert obs is None - obs = meta_df.validate({'Primary Energy': {'up': 5.0, 'year': 2010}}, + obs = test_df.validate({'Primary Energy': {'up': 5.0, 'year': 2010}}, exclude_on_fail=False) assert len(obs) == 2 -def test_validate_exclude(meta_df): - meta_df.validate({'Primary Energy': {'up': 6.0}}, exclude_on_fail=True) - assert list(meta_df['exclude']) == [False, True] +def test_validate_exclude(test_df): + test_df.validate({'Primary Energy': {'up': 6.0}}, exclude_on_fail=True) + assert list(test_df['exclude']) == [False, True] -def test_validate_top_level(meta_df): - obs = validate(meta_df, criteria={'Primary Energy': {'up': 6.0}}, +def test_validate_top_level(test_df): + obs = validate(test_df, criteria={'Primary Energy': {'up': 6.0}}, exclude_on_fail=True, variable='Primary Energy') assert len(obs) == 1 - if 'year' in meta_df.data: + if 'year' in test_df.data: assert obs['year'].values[0] == 2010 else: exp_time = pd.to_datetime(datetime.datetime(2010, 7, 21)) - assert (pd.to_datetime(obs['time'].values[0]) == exp_time) - assert list(meta_df['exclude']) == [False, True] + assert (pd.to_datetime(obs['time'].values[0]).date() == exp_time) + assert list(test_df['exclude']) == [False, True] -def test_category_none(meta_df): - meta_df.categorize('category', 'Testing', {'Primary Energy': {'up': 0.8}}) - assert 'category' not in meta_df.meta.columns +def test_category_none(test_df): + test_df.categorize('category', 'Testing', {'Primary Energy': {'up': 0.8}}) + assert 'category' not in test_df.meta.columns -def test_category_pass(meta_df): +def test_category_pass(test_df): dct = {'model': ['model_a', 'model_a'], 'scenario': ['scen_a', 'scen_b'], 'category': ['foo', None]} exp = 
pd.DataFrame(dct).set_index(['model', 'scenario'])['category'] - meta_df.categorize('category', 'foo', {'Primary Energy': + test_df.categorize('category', 'foo', {'Primary Energy': {'up': 6, 'year': 2010}}) - obs = meta_df['category'] + obs = test_df['category'] pd.testing.assert_series_equal(obs, exp) -def test_category_top_level(meta_df): +def test_category_top_level(test_df): dct = {'model': ['model_a', 'model_a'], 'scenario': ['scen_a', 'scen_b'], 'category': ['foo', None]} exp = pd.DataFrame(dct).set_index(['model', 'scenario'])['category'] - categorize(meta_df, 'category', 'foo', + categorize(test_df, 'category', 'foo', criteria={'Primary Energy': {'up': 6, 'year': 2010}}, variable='Primary Energy') - obs = meta_df['category'] + obs = test_df['category'] pd.testing.assert_series_equal(obs, exp) @@ -643,15 +645,15 @@ def test_interpolate(test_df_year): assert not test_df_year.filter().data.duplicated().any() -def test_filter_by_bool(meta_df): - meta_df.set_meta([True, False], name='exclude') - obs = meta_df.filter(exclude=True) +def test_filter_by_bool(test_df): + test_df.set_meta([True, False], name='exclude') + obs = test_df.filter(exclude=True) assert obs['scenario'].unique() == 'scen_a' -def test_filter_by_int(meta_df): - meta_df.set_meta([1, 2], name='test') - obs = meta_df.filter(test=[1, 3]) +def test_filter_by_int(test_df): + test_df.set_meta([1, 2], name='test') + obs = test_df.filter(test=[1, 3]) assert obs['scenario'].unique() == 'scen_a' @@ -762,13 +764,13 @@ def test_48c(): pd.testing.assert_frame_equal(obs, exp, check_index_type=False) -def test_pd_filter_by_meta(meta_df): +def test_pd_filter_by_meta(test_df): data = df_filter_by_meta_matching_idx.set_index(['model', 'region']) - meta_df.set_meta([True, False], 'boolean') - meta_df.set_meta(0, 'integer') + test_df.set_meta([True, False], 'boolean') + test_df.set_meta(0, 'integer') - obs = filter_by_meta(data, meta_df, join_meta=True, + obs = filter_by_meta(data, test_df, join_meta=True, boolean=True, integer=None) obs = obs.reindex(columns=['scenario', 'col', 'boolean', 'integer']) @@ -779,13 +781,13 @@ def test_pd_filter_by_meta(meta_df): pd.testing.assert_frame_equal(obs, exp) -def test_pd_filter_by_meta_no_index(meta_df): +def test_pd_filter_by_meta_no_index(test_df): data = df_filter_by_meta_matching_idx - meta_df.set_meta([True, False], 'boolean') - meta_df.set_meta(0, 'int') + test_df.set_meta([True, False], 'boolean') + test_df.set_meta(0, 'int') - obs = filter_by_meta(data, meta_df, join_meta=True, + obs = filter_by_meta(data, test_df, join_meta=True, boolean=True, int=None) obs = obs.reindex(columns=META_IDX + ['region', 'col', 'boolean', 'int']) @@ -796,11 +798,11 @@ def test_pd_filter_by_meta_no_index(meta_df): pd.testing.assert_frame_equal(obs, exp) -def test_pd_filter_by_meta_nonmatching_index(meta_df): +def test_pd_filter_by_meta_nonmatching_index(test_df): data = df_filter_by_meta_nonmatching_idx - meta_df.set_meta(['a', 'b'], 'string') + test_df.set_meta(['a', 'b'], 'string') - obs = filter_by_meta(data, meta_df, join_meta=True, string='b') + obs = filter_by_meta(data, test_df, join_meta=True, string='b') obs = obs.reindex(columns=['scenario', 2010, 2020, 'string']) exp = data.iloc[2:3].copy() @@ -809,11 +811,11 @@ def test_pd_filter_by_meta_nonmatching_index(meta_df): pd.testing.assert_frame_equal(obs, exp) -def test_pd_join_by_meta_nonmatching_index(meta_df): +def test_pd_join_by_meta_nonmatching_index(test_df): data = df_filter_by_meta_nonmatching_idx - meta_df.set_meta(['a', 'b'], 'string') + 
test_df.set_meta(['a', 'b'], 'string') - obs = filter_by_meta(data, meta_df, join_meta=True, string=None) + obs = filter_by_meta(data, test_df, join_meta=True, string=None) obs = obs.reindex(columns=['scenario', 2010, 2020, 'string']) exp = data.copy() @@ -830,8 +832,8 @@ def test_concat_fails_notdf(): pytest.raises(TypeError, concat, 'foo') -def test_concat(meta_df): - left = IamDataFrame(meta_df.data.copy()) +def test_concat(test_df): + left = IamDataFrame(test_df.data.copy()) right = left.data.copy() right['model'] = 'not left' right = IamDataFrame(right) @@ -847,22 +849,22 @@ def test_concat(meta_df): pd.testing.assert_frame_equal(obs, exp) -def test_normalize(meta_df): - exp = meta_df.data.copy().reset_index(drop=True) +def test_normalize(test_df): + exp = test_df.data.copy().reset_index(drop=True) exp['value'][1::2] /= exp['value'][::2].values exp['value'][::2] /= exp['value'][::2].values - if "year" in meta_df.data: - obs = meta_df.normalize(year=2005).data.reset_index(drop=True) + if "year" in test_df.data: + obs = test_df.normalize(year=2005).data.reset_index(drop=True) else: - obs = meta_df.normalize( + obs = test_df.normalize( time=datetime.datetime(2005, 6, 17) ).data.reset_index(drop=True) pd.testing.assert_frame_equal(obs, exp) -def test_normalize_not_time(meta_df): - pytest.raises(ValueError, meta_df.normalize, variable='foo') - pytest.raises(ValueError, meta_df.normalize, year=2015, variable='foo') +def test_normalize_not_time(test_df): + pytest.raises(ValueError, test_df.normalize, variable='foo') + pytest.raises(ValueError, test_df.normalize, year=2015, variable='foo') @pytest.mark.parametrize("inplace", [True, False]) diff --git a/tests/test_feature_aggregate.py b/tests/test_feature_aggregate.py index 31f4cd344..59d90771c 100644 --- a/tests/test_feature_aggregate.py +++ b/tests/test_feature_aggregate.py @@ -54,26 +54,26 @@ def test_aggregate_region_missing_all_subregions(): assert len(obs) == 0 -def test_do_aggregate_append(meta_df): - meta_df.rename({'variable': {'Primary Energy': 'Primary Energy|Gas'}}, +def test_do_aggregate_append(test_df): + test_df.rename({'variable': {'Primary Energy': 'Primary Energy|Gas'}}, inplace=True) - meta_df.aggregate('Primary Energy', append=True) - obs = meta_df.filter(variable='Primary Energy').timeseries() + test_df.aggregate('Primary Energy', append=True) + df = test_df.filter(variable='Primary Energy') - dts = TEST_DTS - times = [2005, 2010] if "year" in meta_df.data else dts + times = [2005, 2010] if "year" in test_df.data else TEST_DTS exp = pd.DataFrame([ ['model_a', 'scen_a', 'World', 'Primary Energy', 'EJ/y', 1.5, 9.], ['model_a', 'scen_b', 'World', 'Primary Energy', 'EJ/y', 2, 7], ], columns=['model', 'scenario', 'region', 'variable', 'unit'] + times ).set_index(IAMC_IDX) - if "year" in meta_df.data: + if "year" in test_df.data: exp.columns = list(map(int, exp.columns)) else: + df.data.time = df.data.time.dt.normalize() exp.columns = pd.to_datetime(exp.columns) - pd.testing.assert_frame_equal(obs, exp) + pd.testing.assert_frame_equal(df.timeseries(), exp) def test_check_aggregate_pass(check_aggregate_df): @@ -97,16 +97,16 @@ def test_check_internal_consistency_no_world_for_variable( assert caplog.records[warn_idx].levelname == "INFO" -def test_check_aggregate_fail(meta_df): - obs = meta_df.check_aggregate('Primary Energy', exclude_on_fail=True) +def test_check_aggregate_fail(test_df): + obs = test_df.check_aggregate('Primary Energy', exclude_on_fail=True) assert len(obs.columns) == 2 assert obs.index.get_values()[0] == ( 
'model_a', 'scen_a', 'World', 'Primary Energy', 'EJ/y' ) -def test_check_aggregate_top_level(meta_df): - obs = check_aggregate(meta_df, variable='Primary Energy', year=2005) +def test_check_aggregate_top_level(test_df): + obs = check_aggregate(test_df, variable='Primary Energy', year=2005) assert len(obs.columns) == 1 assert obs.index.get_values()[0] == ( 'model_a', 'scen_a', 'World', 'Primary Energy', 'EJ/y' diff --git a/tests/test_feature_append_rename_convert.py b/tests/test_feature_append_rename_convert.py index b34d47c69..b2ab5b4de 100644 --- a/tests/test_feature_append_rename_convert.py +++ b/tests/test_feature_append_rename_convert.py @@ -30,20 +30,20 @@ )).data.sort_values(by='region').reset_index(drop=True) -def test_append_other_scenario(meta_df): - other = meta_df.filter(scenario='scen_b')\ +def test_append_other_scenario(test_df): + other = test_df.filter(scenario='scen_b')\ .rename({'scenario': {'scen_b': 'scen_c'}}) - meta_df.set_meta([0, 1], name='col1') - meta_df.set_meta(['a', 'b'], name='col2') + test_df.set_meta([0, 1], name='col1') + test_df.set_meta(['a', 'b'], name='col2') other.set_meta(2, name='col1') other.set_meta('x', name='col3') - df = meta_df.append(other) + df = test_df.append(other) # check that the original meta dataframe is not updated - obs = meta_df.meta.index.get_level_values(1) + obs = test_df.meta.index.get_level_values(1) npt.assert_array_equal(obs, ['scen_a', 'scen_b']) # assert that merging of meta works as expected @@ -63,26 +63,26 @@ def test_append_other_scenario(meta_df): npt.assert_array_equal(ts.iloc[2].values, ts.iloc[3].values) -def test_append_same_scenario(meta_df): - other = meta_df.filter(scenario='scen_b')\ +def test_append_same_scenario(test_df): + other = test_df.filter(scenario='scen_b')\ .rename({'variable': {'Primary Energy': 'Primary Energy clone'}}) - meta_df.set_meta([0, 1], name='col1') + test_df.set_meta([0, 1], name='col1') other.set_meta(2, name='col1') other.set_meta('b', name='col2') # check that non-matching meta raise an error - pytest.raises(ValueError, meta_df.append, other=other) + pytest.raises(ValueError, test_df.append, other=other) # check that ignoring meta conflict works as expetced - df = meta_df.append(other, ignore_meta_conflict=True) + df = test_df.append(other, ignore_meta_conflict=True) # check that the new meta.index is updated, but not the original one - npt.assert_array_equal(meta_df.meta.columns, ['exclude', 'col1']) + npt.assert_array_equal(test_df.meta.columns, ['exclude', 'col1']) # assert that merging of meta works as expected - exp = meta_df.meta.copy() + exp = test_df.meta.copy() exp['col2'] = [np.nan, 'b'] pd.testing.assert_frame_equal(df.meta, exp) @@ -148,38 +148,38 @@ def test_rename_data_cols_by_mixed(): pd.testing.assert_frame_equal(obs, EXP_RENAME_DF, check_index_type=False) -def test_rename_conflict(meta_df): +def test_rename_conflict(test_df): mapping = {'scenario': {'scen_a': 'scen_b'}} - pytest.raises(ValueError, meta_df.rename, mapping, **mapping) + pytest.raises(ValueError, test_df.rename, mapping, **mapping) -def test_rename_index_data_fail(meta_df): +def test_rename_index_data_fail(test_df): mapping = {'scenario': {'scen_a': 'scen_c'}, 'variable': {'Primary Energy|Coal': 'Primary Energy|Gas'}} - pytest.raises(ValueError, meta_df.rename, mapping) + pytest.raises(ValueError, test_df.rename, mapping) -def test_rename_index_fail_duplicates(meta_df): +def test_rename_index_fail_duplicates(test_df): mapping = {'scenario': {'scen_a': 'scen_b'}} - pytest.raises(ValueError, meta_df.rename, 
mapping) + pytest.raises(ValueError, test_df.rename, mapping) -def test_rename_index(meta_df): +def test_rename_index(test_df): mapping = {'model': {'model_a': 'model_b'}} - obs = meta_df.rename(mapping, scenario={'scen_a': 'scen_c'}) + obs = test_df.rename(mapping, scenario={'scen_a': 'scen_c'}) # test data changes - dts = TEST_DTS - times = [2005, 2010] if 'year' in meta_df.data else dts + times = [2005, 2010] if 'year' in test_df.data else TEST_DTS exp = pd.DataFrame([ ['model_b', 'scen_c', 'World', 'Primary Energy', 'EJ/y', 1, 6.], ['model_b', 'scen_c', 'World', 'Primary Energy|Coal', 'EJ/y', 0.5, 3], ['model_a', 'scen_b', 'World', 'Primary Energy', 'EJ/y', 2, 7], ], columns=['model', 'scenario', 'region', 'variable', 'unit'] + times ).set_index(IAMC_IDX).sort_index() - if "year" in meta_df.data: + if "year" in test_df.data: exp.columns = list(map(int, exp.columns)) else: + obs.data.time = obs.data.time.dt.normalize() exp.columns = pd.to_datetime(exp.columns) pd.testing.assert_frame_equal(obs.timeseries().sort_index(), exp) @@ -192,14 +192,13 @@ def test_rename_index(meta_df): pd.testing.assert_frame_equal(obs.meta, exp) -def test_rename_append(meta_df): +def test_rename_append(test_df): mapping = {'model': {'model_a': 'model_b'}, 'scenario': {'scen_a': 'scen_c'}} - obs = meta_df.rename(mapping, append=True) + obs = test_df.rename(mapping, append=True) # test data changes - dts = [dt.datetime(2005, 6, 17), dt.datetime(2010, 7, 21)] - times = [2005, 2010] if "year" in meta_df.data else dts + times = [2005, 2010] if "year" in test_df.data else TEST_DTS exp = pd.DataFrame([ ['model_a', 'scen_a', 'World', 'Primary Energy', 'EJ/y', 1, 6.], ['model_a', 'scen_a', 'World', 'Primary Energy|Coal', 'EJ/y', 0.5, 3], @@ -208,9 +207,10 @@ def test_rename_append(meta_df): ['model_b', 'scen_c', 'World', 'Primary Energy|Coal', 'EJ/y', 0.5, 3], ], columns=['model', 'scenario', 'region', 'variable', 'unit'] + times ).set_index(IAMC_IDX).sort_index() - if "year" in meta_df.data: + if "year" in test_df.data: exp.columns = list(map(int, exp.columns)) else: + obs.data.time = obs.data.time.dt.normalize() exp.columns = pd.to_datetime(exp.columns) pd.testing.assert_frame_equal(obs.timeseries().sort_index(), exp) diff --git a/tests/test_feature_compare.py b/tests/test_feature_compare.py index e9813ae58..ceab4df3d 100644 --- a/tests/test_feature_compare.py +++ b/tests/test_feature_compare.py @@ -1,4 +1,3 @@ -import copy import datetime as dt @@ -7,13 +6,13 @@ from pyam import compare, IAMC_IDX -def test_compare(meta_df): - clone = copy.deepcopy(meta_df) +def test_compare(test_df): + clone = test_df.copy() clone.data.iloc[0, clone.data.columns.get_loc('value')] = 2 clone.rename({'variable': {'Primary Energy|Coal': 'Primary Energy|Gas'}}, inplace=True) - obs = compare(meta_df, clone, right_label='meta_df', left_label='clone') + obs = compare(test_df, clone, right_label='test_df', left_label='clone') exp = pd.DataFrame([ ['Primary Energy', 'EJ/y', dt.datetime(2005, 6, 17), 2, 1], @@ -22,17 +21,20 @@ def test_compare(meta_df): ['Primary Energy|Gas', 'EJ/y', dt.datetime(2005, 6, 17), 0.5, np.nan], ['Primary Energy|Gas', 'EJ/y', dt.datetime(2010, 7, 21), 3, np.nan], ], - columns=['variable', 'unit', 'time', 'meta_df', 'clone'], + columns=['variable', 'unit', 'time', 'test_df', 'clone'], ) exp['model'] = 'model_a' exp['scenario'] = 'scen_a' exp['region'] = 'World' time_col = 'time' - if 'year' in meta_df.data.columns: + if 'year' in test_df.data.columns: exp['year'] = exp['time'].apply(lambda x: x.year) exp = 
exp.drop('time', axis='columns') time_col = 'year' + else: + obs = obs.reset_index() + obs.time = obs.time.dt.normalize() + obs = obs.set_index(IAMC_IDX + [time_col]) exp = exp.set_index(IAMC_IDX + [time_col]) - pd.testing.assert_frame_equal(obs, exp) diff --git a/tests/test_feature_set_meta.py b/tests/test_feature_set_meta.py index aa919fb3c..72d0f031f 100644 --- a/tests/test_feature_set_meta.py +++ b/tests/test_feature_set_meta.py @@ -7,137 +7,137 @@ labels=[[0, 0], [0, 1]], names=['model', 'scenario']) -def test_set_meta_no_name(meta_df): +def test_set_meta_no_name(test_df): idx = pd.MultiIndex(levels=[['a_scenario'], ['a_model'], ['some_region']], labels=[[0], [0], [0]], names=['scenario', 'model', 'region']) s = pd.Series(data=[0.3], index=idx) - pytest.raises(ValueError, meta_df.set_meta, s) + pytest.raises(ValueError, test_df.set_meta, s) -def test_set_meta_as_named_series(meta_df): +def test_set_meta_as_named_series(test_df): idx = pd.MultiIndex(levels=[['scen_a'], ['model_a'], ['some_region']], labels=[[0], [0], [0]], names=['scenario', 'model', 'region']) s = pd.Series(data=[0.3], index=idx, name='meta_values') - meta_df.set_meta(s) + test_df.set_meta(s) exp = pd.Series(data=[0.3, np.nan], index=EXP_IDX, name='meta_values') - pd.testing.assert_series_equal(meta_df['meta_values'], exp) + pd.testing.assert_series_equal(test_df['meta_values'], exp) -def test_set_meta_as_unnamed_series(meta_df): +def test_set_meta_as_unnamed_series(test_df): idx = pd.MultiIndex(levels=[['scen_a'], ['model_a'], ['some_region']], labels=[[0], [0], [0]], names=['scenario', 'model', 'region']) s = pd.Series(data=[0.3], index=idx) - meta_df.set_meta(s, name='meta_values') + test_df.set_meta(s, name='meta_values') exp = pd.Series(data=[0.3, np.nan], index=EXP_IDX, name='meta_values') - pd.testing.assert_series_equal(meta_df['meta_values'], exp) + pd.testing.assert_series_equal(test_df['meta_values'], exp) -def test_set_meta_non_unique_index_fail(meta_df): +def test_set_meta_non_unique_index_fail(test_df): idx = pd.MultiIndex(levels=[['model_a'], ['scen_a'], ['reg_a', 'reg_b']], labels=[[0, 0], [0, 0], [0, 1]], names=['model', 'scenario', 'region']) s = pd.Series([0.4, 0.5], idx) - pytest.raises(ValueError, meta_df.set_meta, s) + pytest.raises(ValueError, test_df.set_meta, s) -def test_set_meta_non_existing_index_fail(meta_df): +def test_set_meta_non_existing_index_fail(test_df): idx = pd.MultiIndex(levels=[['model_a', 'fail_model'], ['scen_a', 'fail_scenario']], labels=[[0, 1], [0, 1]], names=['model', 'scenario']) s = pd.Series([0.4, 0.5], idx) - pytest.raises(ValueError, meta_df.set_meta, s) + pytest.raises(ValueError, test_df.set_meta, s) -def test_set_meta_by_df(meta_df): +def test_set_meta_by_df(test_df): df = pd.DataFrame([ ['model_a', 'scen_a', 'some_region', 1], ], columns=['model', 'scenario', 'region', 'col']) - meta_df.set_meta(meta=0.3, name='meta_values', index=df) + test_df.set_meta(meta=0.3, name='meta_values', index=df) exp = pd.Series(data=[0.3, np.nan], index=EXP_IDX, name='meta_values') - pd.testing.assert_series_equal(meta_df['meta_values'], exp) + pd.testing.assert_series_equal(test_df['meta_values'], exp) -def test_set_meta_as_series(meta_df): +def test_set_meta_as_series(test_df): s = pd.Series([0.3, 0.4]) - meta_df.set_meta(s, 'meta_series') + test_df.set_meta(s, 'meta_series') exp = pd.Series(data=[0.3, 0.4], index=EXP_IDX, name='meta_series') - pd.testing.assert_series_equal(meta_df['meta_series'], exp) + pd.testing.assert_series_equal(test_df['meta_series'], exp) -def 
test_set_meta_as_int(meta_df): - meta_df.set_meta(3.2, 'meta_int') +def test_set_meta_as_int(test_df): + test_df.set_meta(3.2, 'meta_int') exp = pd.Series(data=[3.2, 3.2], index=EXP_IDX, name='meta_int') - obs = meta_df['meta_int'] + obs = test_df['meta_int'] pd.testing.assert_series_equal(obs, exp) -def test_set_meta_as_str(meta_df): - meta_df.set_meta('testing', name='meta_str') +def test_set_meta_as_str(test_df): + test_df.set_meta('testing', name='meta_str') exp = pd.Series(data=['testing'] * 2, index=EXP_IDX, name='meta_str') - pd.testing.assert_series_equal(meta_df['meta_str'], exp) + pd.testing.assert_series_equal(test_df['meta_str'], exp) -def test_set_meta_as_str_list(meta_df): - meta_df.set_meta(['testing', 'testing2'], name='category') - obs = meta_df.filter(category='testing') +def test_set_meta_as_str_list(test_df): + test_df.set_meta(['testing', 'testing2'], name='category') + obs = test_df.filter(category='testing') assert obs['scenario'].unique() == 'scen_a' -def test_set_meta_as_str_by_index(meta_df): +def test_set_meta_as_str_by_index(test_df): idx = pd.MultiIndex(levels=[['model_a'], ['scen_a']], labels=[[0], [0]], names=['model', 'scenario']) - meta_df.set_meta('foo', 'meta_str', idx) + test_df.set_meta('foo', 'meta_str', idx) exp = pd.Series(data=['foo', None], index=EXP_IDX, name='meta_str') - pd.testing.assert_series_equal(meta_df['meta_str'], exp) + pd.testing.assert_series_equal(test_df['meta_str'], exp) -def test_set_meta_from_data(meta_df): - meta_df.set_meta_from_data('pe_2005', variable='Primary Energy', year=2005) +def test_set_meta_from_data(test_df): + test_df.set_meta_from_data('pe_2005', variable='Primary Energy', year=2005) exp = pd.Series(data=[1., 2.], index=EXP_IDX, name='pe_2005') - pd.testing.assert_series_equal(meta_df['pe_2005'], exp) + pd.testing.assert_series_equal(test_df['pe_2005'], exp) -def test_set_meta_from_data_max(meta_df): - meta_df.set_meta_from_data('pe_max_yr', variable='Primary Energy', +def test_set_meta_from_data_max(test_df): + test_df.set_meta_from_data('pe_max_yr', variable='Primary Energy', method=np.max) exp = pd.Series(data=[6., 7.], index=EXP_IDX, name='pe_max_yr') - pd.testing.assert_series_equal(meta_df['pe_max_yr'], exp) + pd.testing.assert_series_equal(test_df['pe_max_yr'], exp) -def test_set_meta_from_data_mean(meta_df): - meta_df.set_meta_from_data('pe_mean', variable='Primary Energy', +def test_set_meta_from_data_mean(test_df): + test_df.set_meta_from_data('pe_mean', variable='Primary Energy', method=np.mean) exp = pd.Series(data=[3.5, 4.5], index=EXP_IDX, name='pe_mean') - pd.testing.assert_series_equal(meta_df['pe_mean'], exp) + pd.testing.assert_series_equal(test_df['pe_mean'], exp) -def test_set_meta_from_data_method_other_column(meta_df): - if 'year' in meta_df.data.columns: +def test_set_meta_from_data_method_other_column(test_df): + if 'year' in test_df.data.columns: col, value = 'year', 2010 else: - col, value = 'time', pd.to_datetime('2010-07-21T00:00:00.0') - meta_df.set_meta_from_data('pe_max_yr', variable='Primary Energy', + col, value = 'time', max(test_df.data.time) + test_df.set_meta_from_data('pe_max_yr', variable='Primary Energy', method=np.max, column=col) exp = pd.Series(data=[value] * 2, index=EXP_IDX, name='pe_max_yr') - pd.testing.assert_series_equal(meta_df['pe_max_yr'], exp) + pd.testing.assert_series_equal(test_df['pe_max_yr'], exp) -def test_set_meta_from_data_nonunique(meta_df): +def test_set_meta_from_data_nonunique(test_df): # the filtered `data` dataframe is not unique with regard to 
META_IDX - pytest.raises(ValueError, meta_df.set_meta_from_data, 'fail', + pytest.raises(ValueError, test_df.set_meta_from_data, 'fail', variable='Primary Energy') diff --git a/tests/test_io.py b/tests/test_io.py index 3c4eeaed2..89ac744c2 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -10,16 +10,16 @@ FILTER_ARGS = dict(scenario='scen_a') -def test_io_csv(meta_df): +def test_io_csv(test_df): # write to csv file = 'testing_io_write_read.csv' - meta_df.to_csv(file) + test_df.to_csv(file) # read from csv import_df = IamDataFrame(file) # assert that `data` tables are equal and delete file - pd.testing.assert_frame_equal(meta_df.data, import_df.data) + pd.testing.assert_frame_equal(test_df.data, import_df.data) os.remove(file) @@ -27,14 +27,14 @@ def test_io_csv(meta_df): [{}, {}], [dict(include_meta='foo'), dict(meta_sheet_name='foo')] ]) -def test_io_xlsx(meta_df, meta_args): +def test_io_xlsx(test_df, meta_args): # add column to `meta` - meta_df.set_meta(['a', 'b'], 'string') + test_df.set_meta(['a', 'b'], 'string') # write to xlsx (direct file name and ExcelWriter, see bug report #300) file = 'testing_io_write_read.xlsx' for f in [file, pd.ExcelWriter(file)]: - meta_df.to_excel(f, **meta_args[0]) + test_df.to_excel(f, **meta_args[0]) if isinstance(f, pd.ExcelWriter): f.close() @@ -42,16 +42,16 @@ def test_io_xlsx(meta_df, meta_args): import_df = IamDataFrame(file, **meta_args[1]) # assert that `data` and `meta` tables are equal and delete file - pd.testing.assert_frame_equal(meta_df.data, import_df.data) - pd.testing.assert_frame_equal(meta_df.meta, import_df.meta) + pd.testing.assert_frame_equal(test_df.data, import_df.data) + pd.testing.assert_frame_equal(test_df.meta, import_df.meta) os.remove(file) @pytest.mark.parametrize("args", [{}, dict(sheet_name='meta')]) -def test_load_meta(meta_df, args): +def test_load_meta(test_df, args): file = os.path.join(TEST_DATA_DIR, 'testing_metadata.xlsx') - meta_df.load_meta(file, **args) - obs = meta_df.meta + test_df.load_meta(file, **args) + obs = test_df.meta dct = {'model': ['model_a'] * 2, 'scenario': ['scen_a', 'scen_b'], 'category': ['imported', np.nan], 'exclude': [False, False]} From 4636f9042b470e0db9a9cd99ba3f5d02475c8811 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Sat, 14 Dec 2019 06:57:38 +0100 Subject: [PATCH 04/26] appease stickler --- tests/test_cast_to_iamc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_cast_to_iamc.py b/tests/test_cast_to_iamc.py index 522b0d827..e21258297 100644 --- a/tests/test_cast_to_iamc.py +++ b/tests/test_cast_to_iamc.py @@ -4,6 +4,7 @@ from conftest import TEST_DTS + def test_cast_from_value_col(test_df_year): df_with_value_cols = pd.DataFrame([ ['model_a', 'scen_a', 'World', 'EJ/y', 2005, 1, 0.5], From c22dca08cd669d158ae8dfabe8277074c23f3861 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Sat, 14 Dec 2019 14:54:35 +0100 Subject: [PATCH 05/26] merge relevant changes from `peterkolp:region_aggregation_mip_feature` --- pyam/core.py | 48 ++++++++++++++++++++++++--------- pyam/utils.py | 2 ++ tests/test_feature_aggregate.py | 6 +++++ 3 files changed, 44 insertions(+), 12 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index 4da8c568b..eb65ca470 100644 --- a/pyam/core.py +++ b/pyam/core.py @@ -38,6 +38,7 @@ YEAR_IDX, IAMC_IDX, SORT_IDX, + KNOWN_FUNCS ) from pyam.read_ixmp import read_ix from pyam.timeseries import fill_series @@ -766,7 +767,7 @@ def normalize(self, inplace=False, **kwargs): if not inplace: return ret - def aggregate(self, variable, components=None, 
append=False): + def aggregate(self, variable, components=None, method='sum', append=False): """Compute the aggregate of timeseries components or sub-categories Parameters @@ -775,6 +776,9 @@ def aggregate(self, variable, components=None, append=False): variable for which the aggregate should be computed components: list of str, default None list of variables, defaults to all sub-categories of `variable` + method: func or str + method to use for aggregation + e.g. np.mean, np.sum, ... 'min', 'max', ... append: bool, default False append the aggregate timeseries to `data` and return None, else return aggregate timeseries @@ -789,15 +793,15 @@ def aggregate(self, variable, components=None, append=False): return rows = self._apply_filters(variable=components) - _data = _aggregate(self.data[rows], 'variable') + _data = _aggregate(self.data[rows], 'variable', method) if append is True: self.append(_data, variable=variable, inplace=True) else: return _data - def check_aggregate(self, variable, components=None, exclude_on_fail=False, - multiplier=1, **kwargs): + def check_aggregate(self, variable, components=None, method='sum', + exclude_on_fail=False, multiplier=1, **kwargs): """Check whether a timeseries matches the aggregation of its components Parameters @@ -806,6 +810,8 @@ def check_aggregate(self, variable, components=None, exclude_on_fail=False, variable to be checked for matching aggregation of sub-categories components: list of str, default None list of variables, defaults to all sub-categories of `variable` + method: func or str + method to use for aggregation, e.g. ['mean', np.min] exclude_on_fail: boolean, default False flag scenarios failing validation as `exclude: True` multiplier: number, default 1 @@ -820,7 +826,8 @@ def check_aggregate(self, variable, components=None, exclude_on_fail=False, # filter and groupby data, use `pd.Series.align` for matching index rows = self._apply_filters(variable=variable) df_variable, df_components = ( - _aggregate(self.data[rows], 'variable').align(df_components) + _aggregate(self.data[rows], 'variable', method) + .align(df_components) ) # use `np.isclose` for checking match @@ -837,7 +844,7 @@ def check_aggregate(self, variable, components=None, exclude_on_fail=False, return IamDataFrame(diff, variable=variable).timeseries() def aggregate_region(self, variable, region='World', subregions=None, - components=None, append=False): + components=None, append=False, method='sum'): """Compute the aggregate of timeseries over a number of regions including variable components only defined at the `region` level @@ -858,9 +865,7 @@ def aggregate_region(self, variable, region='World', subregions=None, else return aggregate timeseries """ # default subregions to all regions other than `region` - if subregions is None: - rows = self._apply_filters(variable=variable) - subregions = set(self.data[rows].region) - set([region]) + subregions = subregions or self._all_other_regions(region, variable) if not len(subregions): msg = 'cannot aggregate variable `{}` to `{}` because it does not'\ @@ -872,7 +877,8 @@ def aggregate_region(self, variable, region='World', subregions=None, # compute aggregate over all subregions subregion_df = self.filter(region=subregions) cols = ['region', 'variable'] - _data = _aggregate(subregion_df.filter(variable=variable).data, cols) + _data = _aggregate(subregion_df.filter(variable=variable).data, + cols, method=method) # add components at the `region` level, defaults to all variables one # level below `variable` that are only present in 
`region` @@ -893,6 +899,11 @@ def aggregate_region(self, variable, region='World', subregions=None, else: return _data + def _all_other_regions(self, region, variable): + """Determine subregions as all regions other than `region`""" + rows = self._apply_filters(variable=variable) + return set(self.data[rows].region) - set([region]) + def check_aggregate_region(self, variable, region='World', subregions=None, components=None, exclude_on_fail=False, **kwargs): @@ -1444,11 +1455,24 @@ def _meta_idx(data): return data[META_IDX].drop_duplicates().set_index(META_IDX).index -def _aggregate(df, by): +def _aggregate(df, by, method=np.sum): """Aggregate `df` by specified column(s), return indexed `pd.Series`""" by = [by] if isstr(by) else by cols = [c for c in list(df.columns) if c not in ['value'] + by] - return df.groupby(cols).sum()['value'] + # pick aggregator func (default: sum) + return df.groupby(cols)['value'].agg(_get_method_func(method)) + + +def _get_method_func(method): + """Translate a string to a known method""" + if not isstr(method): + return method + + if method in KNOWN_FUNCS: + return KNOWN_FUNCS[method] + + # raise error if `method` is a string but not in dict of known methods + raise ValueError('method `{}` is not a known aggregator'.format(method)) def _raise_filter_error(col): diff --git a/pyam/utils.py b/pyam/utils.py index 4b3ad9f31..9a30fbe3d 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -33,6 +33,8 @@ + ['{}{}'.format(i, j) for i, j in itertools.product( string.ascii_uppercase, string.ascii_uppercase)])) +KNOWN_FUNCS = {'min': np.min, 'max': np.max, 'avg': np.mean, 'sum': np.sum} + def requires_package(pkg, msg, error_type=ImportError): """Decorator when a function requires an optional dependency diff --git a/tests/test_feature_aggregate.py b/tests/test_feature_aggregate.py index 59d90771c..0e41827e6 100644 --- a/tests/test_feature_aggregate.py +++ b/tests/test_feature_aggregate.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd +import pytest from pyam import check_aggregate, IamDataFrame, IAMC_IDX from conftest import TEST_DTS @@ -76,6 +77,11 @@ def test_do_aggregate_append(test_df): pd.testing.assert_frame_equal(df.timeseries(), exp) +def test_aggregate_unknown_method(reg_df): + pytest.raises(ValueError, reg_df.aggregate_region, 'Primary Energy', + method='foo') + + def test_check_aggregate_pass(check_aggregate_df): obs = check_aggregate_df.filter( scenario='a_scen' From 7ef35167484526109a8ae2f38ac731a018467721 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Sat, 14 Dec 2019 15:10:46 +0100 Subject: [PATCH 06/26] docstring clean-up --- pyam/core.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index eb65ca470..7864ed00f 100644 --- a/pyam/core.py +++ b/pyam/core.py @@ -777,8 +777,7 @@ def aggregate(self, variable, components=None, method='sum', append=False): components: list of str, default None list of variables, defaults to all sub-categories of `variable` method: func or str - method to use for aggregation - e.g. np.mean, np.sum, ... 'min', 'max', ... + method to use for aggregation, e.g. np.mean, np.sum, 'min', 'max' append: bool, default False append the aggregate timeseries to `data` and return None, else return aggregate timeseries @@ -811,7 +810,7 @@ def check_aggregate(self, variable, components=None, method='sum', components: list of str, default None list of variables, defaults to all sub-categories of `variable` method: func or str - method to use for aggregation, e.g. 
['mean', np.min] + method to use for aggregation, e.g. np.mean, np.sum, 'min', 'max' exclude_on_fail: boolean, default False flag scenarios failing validation as `exclude: True` multiplier: number, default 1 @@ -844,7 +843,7 @@ def check_aggregate(self, variable, components=None, method='sum', return IamDataFrame(diff, variable=variable).timeseries() def aggregate_region(self, variable, region='World', subregions=None, - components=None, append=False, method='sum'): + components=None, method='sum', append=False): """Compute the aggregate of timeseries over a number of regions including variable components only defined at the `region` level @@ -860,6 +859,8 @@ def aggregate_region(self, variable, region='World', subregions=None, list of variables to include in the aggregate from the `region` level, defaults to all sub-categories of `variable` included in `region` but not in any of `subregions` + method: func or str + method to use for aggregation, e.g. np.mean, np.sum, 'min', 'max' append: bool, default False append the aggregate timeseries to `data` and return None, else return aggregate timeseries From 9969c747eccb8f097d684a07c934ed583f225cf8 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Sat, 14 Dec 2019 16:41:28 +0100 Subject: [PATCH 07/26] set `compenents=False` as default in `[check_]aggregate_region()` --- doc/source/tutorials/checking_databases.ipynb | 2 +- pyam/core.py | 36 +++++++++++-------- tests/test_feature_aggregate.py | 23 +++++++----- 3 files changed, 36 insertions(+), 25 deletions(-) mode change 100644 => 100755 pyam/core.py diff --git a/doc/source/tutorials/checking_databases.ipynb b/doc/source/tutorials/checking_databases.ipynb index e4461abfa..19c503517 100644 --- a/doc/source/tutorials/checking_databases.ipynb +++ b/doc/source/tutorials/checking_databases.ipynb @@ -4059,7 +4059,7 @@ "source": [ "for variable in consistent_df.filter(level=1).variables():\n", " diff = consistent_df.check_aggregate_region(\n", - " variable, \n", + " variable, components=True,\n", " **np_isclose_args\n", " )\n", " assert diff is None" diff --git a/pyam/core.py b/pyam/core.py old mode 100644 new mode 100755 index 7864ed00f..3bd987f33 --- a/pyam/core.py +++ b/pyam/core.py @@ -843,7 +843,7 @@ def check_aggregate(self, variable, components=None, method='sum', return IamDataFrame(diff, variable=variable).timeseries() def aggregate_region(self, variable, region='World', subregions=None, - components=None, method='sum', append=False): + components=False, method='sum', append=False): """Compute the aggregate of timeseries over a number of regions including variable components only defined at the `region` level @@ -855,10 +855,11 @@ def aggregate_region(self, variable, region='World', subregions=None, dimension subregions: list of str list of subregions, defaults to all regions other than `region` - components: list of str - list of variables to include in the aggregate from the `region` - level, defaults to all sub-categories of `variable` included in - `region` but not in any of `subregions` + components: bool or list of str, default False + variables at the `region` level to be included in the aggregation + (ignored if False); if `True`, use all sub-categories of `variable` + included in `region` but not in any of the `subregions`; + or explicit list of variables method: func or str method to use for aggregation, e.g. 
np.mean, np.sum, 'min', 'max' append: bool, default False @@ -886,11 +887,13 @@ def aggregate_region(self, variable, region='World', subregions=None, with adjust_log_level(logger): region_df = self.filter(region=region) - rdf_comps = region_df._variable_components(variable, level=None) - srdf_comps = subregion_df._variable_components(variable, level=None) - components = components or set(rdf_comps).difference(srdf_comps) + # if `True`, auto-detect `components` at the `region` level + if components is True: + r_comps = region_df._variable_components(variable, level=None) + sr_comps = subregion_df._variable_components(variable, level=None) + components = set(r_comps).difference(sr_comps) - if len(components): + if components is not False and len(components): rows = region_df._apply_filters(variable=components) _data = _data.add(_aggregate(region_df.data[rows], cols), fill_value=0) @@ -906,7 +909,7 @@ def _all_other_regions(self, region, variable): return set(self.data[rows].region) - set([region]) def check_aggregate_region(self, variable, region='World', subregions=None, - components=None, exclude_on_fail=False, + components=False, exclude_on_fail=False, **kwargs): """Check whether the region timeseries data match the aggregation of components @@ -919,9 +922,11 @@ def check_aggregate_region(self, variable, region='World', subregions=None, region to be checked for matching aggregation of subregions subregions: list of str list of subregions, defaults to all regions other than `region` - components: list of str, default None - list of variables, defaults to all sub-categories of `variable` - included in `region` but not in any of `subregions` + components: bool or list of str, default False + variables at the `region` level to be included in the aggregation + (ignored if False); if `True`, use all sub-categories of `variable` + included in `region` but not in any of the `subregions`; + or explicit list of variables exclude_on_fail: boolean, default False flag scenarios failing validation as `exclude: True` kwargs: passed to `np.isclose()` @@ -970,7 +975,7 @@ def _variable_components(self, variable, level=0): level=level)] def check_internal_consistency(self, **kwargs): - """Check whether the database is internally consistent + """Check whether a scenario ensemble is internally consistent We check that all variables are equal to the sum of their sectoral components and that all the regions add up to the World total. 
If @@ -993,7 +998,8 @@ def check_internal_consistency(self, **kwargs): if diff_agg is not None: inconsistent_vars[variable + "-aggregate"] = diff_agg - diff_regional = self.check_aggregate_region(variable, **kwargs) + diff_regional = self.check_aggregate_region(variable, + components=True, **kwargs) if diff_regional is not None: inconsistent_vars[variable + "-regional"] = diff_regional diff --git a/tests/test_feature_aggregate.py b/tests/test_feature_aggregate.py index 0e41827e6..3fe92141e 100644 --- a/tests/test_feature_aggregate.py +++ b/tests/test_feature_aggregate.py @@ -129,11 +129,11 @@ def test_df_check_aggregate_pass(check_aggregate_df): def test_df_check_aggregate_region_pass(check_aggregate_df): - obs = check_aggregate_df.check_aggregate_region('Primary Energy') + obs = check_aggregate_df.check_aggregate_region('Primary Energy', components=True) assert obs is None for variable in check_aggregate_df.variables(): - obs = check_aggregate_df.check_aggregate_region(variable) + obs = check_aggregate_df.check_aggregate_region(variable, components=True) assert obs is None @@ -169,7 +169,7 @@ def run_check_agg_fail(pyam_df, tweak_dict, test_type): ) elif 'region' in test_type: obs = pyam_df.check_aggregate_region( - variable, + variable, components=True ) if obs is not None: @@ -260,34 +260,39 @@ def test_df_check_aggregate_region_errors(check_aggregate_regional_df): def test_df_check_aggregate_region_components(check_aggregate_regional_df): obs = check_aggregate_regional_df.check_aggregate_region( - 'Emissions|N2O', 'World', subregions=['REUROPE', 'RASIA'] + 'Emissions|N2O', 'World', subregions=['REUROPE', 'RASIA'], + components=True ) assert obs is None obs = check_aggregate_regional_df.check_aggregate_region( - 'Emissions|N2O|Ind|Solvents', 'World', subregions=['REUROPE', 'RASIA'] + 'Emissions|N2O|Ind|Solvents', 'World', subregions=['REUROPE', 'RASIA'], + components=True ) assert obs is None obs = check_aggregate_regional_df.check_aggregate_region( - 'Emissions|N2O', 'REUROPE', subregions=['Germany', 'UK'] + 'Emissions|N2O', 'REUROPE', subregions=['Germany', 'UK'], + components=True ) assert obs is None obs = check_aggregate_regional_df.check_aggregate_region( - 'Emissions|N2O', 'RASIA', subregions=['China', 'Japan'] + 'Emissions|N2O', 'RASIA', subregions=['China', 'Japan'], + components=True ) assert obs is None obs = check_aggregate_regional_df.check_aggregate_region( - 'Emissions|N2O|Ind|Transport', 'REUROPE', subregions=['Germany', 'UK'] + 'Emissions|N2O|Ind|Transport', 'REUROPE', subregions=['Germany', 'UK'], + components=True ) assert obs is None @pytest.mark.parametrize("components,exp_vals", ( # should find sub-components including nested bunkers - (None, [1.9, 15.7]), + (True, [1.9, 15.7]), # should only add AFOLU onto regional sum, not Shipping emissions (["Emissions|N2O|AFOLU"], [0.9, 9.7]), # specifying Ind leads to double counting (and not skipping AFOLU) but as From 6dc3228dff5a60d88aa655f6ffbe1c62077ac2d7 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Sat, 14 Dec 2019 18:09:21 +0100 Subject: [PATCH 08/26] fix `method` docstring, add `weights` kwarg --- pyam/core.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index 3bd987f33..4a8d6e9e4 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -776,7 +776,7 @@ def aggregate(self, variable, components=None, method='sum', append=False): variable for which the aggregate should be computed components: list of str, default None list of variables, defaults to 
all sub-categories of `variable` - method: func or str + method: func or str, default 'sum' method to use for aggregation, e.g. np.mean, np.sum, 'min', 'max' append: bool, default False append the aggregate timeseries to `data` and return None, @@ -809,7 +809,7 @@ def check_aggregate(self, variable, components=None, method='sum', variable to be checked for matching aggregation of sub-categories components: list of str, default None list of variables, defaults to all sub-categories of `variable` - method: func or str + method: func or str, default 'sum' method to use for aggregation, e.g. np.mean, np.sum, 'min', 'max' exclude_on_fail: boolean, default False flag scenarios failing validation as `exclude: True` @@ -843,7 +843,8 @@ def check_aggregate(self, variable, components=None, method='sum', return IamDataFrame(diff, variable=variable).timeseries() def aggregate_region(self, variable, region='World', subregions=None, - components=False, method='sum', append=False): + components=False, method='sum', weights=None, + append=False): """Compute the aggregate of timeseries over a number of regions including variable components only defined at the `region` level @@ -860,8 +861,10 @@ def aggregate_region(self, variable, region='World', subregions=None, (ignored if False); if `True`, use all sub-categories of `variable` included in `region` but not in any of the `subregions`; or explicit list of variables - method: func or str + method: func or str, default 'sum' method to use for aggregation, e.g. np.mean, np.sum, 'min', 'max' + weights: str, default None + variable to use as weights for the aggregation append: bool, default False append the aggregate timeseries to `data` and return None, else return aggregate timeseries @@ -909,8 +912,8 @@ def _all_other_regions(self, region, variable): return set(self.data[rows].region) - set([region]) def check_aggregate_region(self, variable, region='World', subregions=None, - components=False, exclude_on_fail=False, - **kwargs): + components=False, method='sum', weights='sum', + exclude_on_fail=False, **kwargs): """Check whether the region timeseries data match the aggregation of components @@ -927,13 +930,17 @@ def check_aggregate_region(self, variable, region='World', subregions=None, (ignored if False); if `True`, use all sub-categories of `variable` included in `region` but not in any of the `subregions`; or explicit list of variables + method: func or str, default 'sum' + method to use for aggregation, e.g. 
np.mean, np.sum, 'min', 'max' + weights: str, default None + variable to use as weights for the aggregation exclude_on_fail: boolean, default False flag scenarios failing validation as `exclude: True` kwargs: passed to `np.isclose()` """ # compute aggregate from subregions, return None if no subregions df_subregions = self.aggregate_region(variable, region, subregions, - components) + components, method, weights) if df_subregions is None: return From 978829b45d90a8915f491b02c3217ca9d27fe474 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Sat, 14 Dec 2019 20:55:42 +0100 Subject: [PATCH 09/26] move internal function `_all_other_regions()` --- pyam/core.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index 4a8d6e9e4..4b28fadcb 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -906,11 +906,6 @@ def aggregate_region(self, variable, region='World', subregions=None, else: return _data - def _all_other_regions(self, region, variable): - """Determine subregions as all regions other than `region`""" - rows = self._apply_filters(variable=variable) - return set(self.data[rows].region) - set([region]) - def check_aggregate_region(self, variable, region='World', subregions=None, components=False, method='sum', weights='sum', exclude_on_fail=False, **kwargs): @@ -971,6 +966,11 @@ def check_aggregate_region(self, variable, region='World', subregions=None, col_args = dict(region=region, variable=variable) return IamDataFrame(diff, **col_args).timeseries() + def _all_other_regions(self, region, variable): + """Determine subregions as all regions other than `region`""" + rows = self._apply_filters(variable=variable) + return set(self.data[rows].region) - set([region]) + def _variable_components(self, variable, level=0): """Get all components (sub-categories) of a variable for a given level From bf5813c178359b67f98a357a5a68a8b44d9758c4 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Sat, 14 Dec 2019 20:57:23 +0100 Subject: [PATCH 10/26] update docstring --- pyam/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyam/core.py b/pyam/core.py index 4b28fadcb..094982674 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -967,7 +967,7 @@ def check_aggregate_region(self, variable, region='World', subregions=None, return IamDataFrame(diff, **col_args).timeseries() def _all_other_regions(self, region, variable): - """Determine subregions as all regions other than `region`""" + """Return list of regions other than `region` containing `variable`""" rows = self._apply_filters(variable=variable) return set(self.data[rows].region) - set([region]) From 02adebc60b6596e573489ae8c8dec46f21a968ac Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Sat, 14 Dec 2019 21:25:50 +0100 Subject: [PATCH 11/26] speed-up of `aggregate_region` (no cloning of IamDataFrame) --- pyam/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index 094982674..115aef20c 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -882,8 +882,8 @@ def aggregate_region(self, variable, region='World', subregions=None, # compute aggregate over all subregions subregion_df = self.filter(region=subregions) cols = ['region', 'variable'] - _data = _aggregate(subregion_df.filter(variable=variable).data, - cols, method=method) + rows = subregion_df._apply_filters(variable=variable) + _data = _aggregate(subregion_df.data[rows], cols, method=method) # add components at the `region` level, defaults to all variables one # level below 
`variable` that are only present in `region` From 4a5e359e2cf16b9ca8554bcba4a12079d662076f Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Sat, 14 Dec 2019 21:36:41 +0100 Subject: [PATCH 12/26] fix a kwarg default, add docstrings --- pyam/core.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyam/core.py b/pyam/core.py index 115aef20c..5a2cc89e5 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -865,6 +865,7 @@ def aggregate_region(self, variable, region='World', subregions=None, method to use for aggregation, e.g. np.mean, np.sum, 'min', 'max' weights: str, default None variable to use as weights for the aggregation + (currently only supported with `method='sum'`) append: bool, default False append the aggregate timeseries to `data` and return None, else return aggregate timeseries @@ -907,7 +908,7 @@ def aggregate_region(self, variable, region='World', subregions=None, return _data def check_aggregate_region(self, variable, region='World', subregions=None, - components=False, method='sum', weights='sum', + components=False, method='sum', weights=None, exclude_on_fail=False, **kwargs): """Check whether the region timeseries data match the aggregation of components @@ -929,6 +930,7 @@ def check_aggregate_region(self, variable, region='World', subregions=None, method to use for aggregation, e.g. np.mean, np.sum, 'min', 'max' weights: str, default None variable to use as weights for the aggregation + (currently only supported with `method='sum'`) exclude_on_fail: boolean, default False flag scenarios failing validation as `exclude: True` kwargs: passed to `np.isclose()` From fa152e29c5db37f96cd153ee7766d4eb022cbb01 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Sat, 14 Dec 2019 21:57:13 +0100 Subject: [PATCH 13/26] speed up `aggregate_region()` even more --- pyam/core.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index 5a2cc89e5..841511d14 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -886,21 +886,23 @@ def aggregate_region(self, variable, region='World', subregions=None, rows = subregion_df._apply_filters(variable=variable) _data = _aggregate(subregion_df.data[rows], cols, method=method) - # add components at the `region` level, defaults to all variables one - # level below `variable` that are only present in `region` - with adjust_log_level(logger): - region_df = self.filter(region=region) - - # if `True`, auto-detect `components` at the `region` level - if components is True: - r_comps = region_df._variable_components(variable, level=None) - sr_comps = subregion_df._variable_components(variable, level=None) - components = set(r_comps).difference(sr_comps) - - if components is not False and len(components): - rows = region_df._apply_filters(variable=components) - _data = _data.add(_aggregate(region_df.data[rows], cols), - fill_value=0) + # if not `components=False`, add components at the `region` level + if components is not False: + with adjust_log_level(logger): + region_df = self.filter(region=region) + + # if `True`, auto-detect `components` at the `region` level, + # defaults to variables below `variable` only present in `region` + if components is True: + level = dict(level=None) + r_comps = region_df._variable_components(variable, **level) + sr_comps = subregion_df._variable_components(variable, **level) + components = set(r_comps).difference(sr_comps) + + if len(components): + rows = region_df._apply_filters(variable=components) + _data = 
_data.add(_aggregate(region_df.data[rows], cols), + fill_value=0) if append is True: self.append(_data, region=region, variable=variable, inplace=True) From 4e7fdc21fbc16ff72576be589008bfdce2c2d970 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Sat, 14 Dec 2019 21:58:34 +0100 Subject: [PATCH 14/26] add feature to do weighted average over regions --- pyam/core.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/pyam/core.py b/pyam/core.py index 841511d14..687e68293 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -870,6 +870,10 @@ def aggregate_region(self, variable, region='World', subregions=None, append the aggregate timeseries to `data` and return None, else return aggregate timeseries """ + if weights is not None and components is not False: + msg = 'using weights and components in one operation not supported' + raise ValueError(msg) + # default subregions to all regions other than `region` subregions = subregions or self._all_other_regions(region, variable) @@ -884,7 +888,12 @@ def aggregate_region(self, variable, region='World', subregions=None, subregion_df = self.filter(region=subregions) cols = ['region', 'variable'] rows = subregion_df._apply_filters(variable=variable) - _data = _aggregate(subregion_df.data[rows], cols, method=method) + if weights is None: + _data = _aggregate(subregion_df.data[rows], cols, method=method) + else: + weight_rows = subregion_df._apply_filters(variable=weights) + _data = _aggregate_weights(subregion_df.data[rows], + subregion_df.data[weight_rows], method) # if not `components=False`, add components at the `region` level if components is not False: @@ -1481,6 +1490,19 @@ def _aggregate(df, by, method=np.sum): return df.groupby(cols)['value'].agg(_get_method_func(method)) +def _aggregate_weights(df, weights, method): + """Aggregate `df` by regions with weights, return indexed `pd.Series`""" + # only summation allowed with weights + if method not in ['sum', np.sum]: + raise ValueError('only method `np.sum` allowed for weighted average') + + _data = _get_value_col(df, YEAR_IDX) + _weight = _get_value_col(weights, YEAR_IDX) + + cols = META_IDX + ['year'] + return (_data * _weight).groupby(cols).sum() / _weight.groupby(cols).sum() + + def _get_method_func(method): """Translate a string to a known method""" if not isstr(method): @@ -1493,6 +1515,10 @@ def _get_method_func(method): raise ValueError('method `{}` is not a known aggregator'.format(method)) +def _get_value_col(df, cols): + """Return the value column as `pd.Series with `cols` as index""" + return df.set_index(cols)['value'] + def _raise_filter_error(col): raise ValueError('filter by `{}` not supported'.format(col)) From 92b292cf150b8195b6dd73397cdb2abf9d22f81b Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Sat, 14 Dec 2019 22:08:06 +0100 Subject: [PATCH 15/26] refactor kwarg and auxiliary function to `weight` --- pyam/core.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index 687e68293..65078ff07 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -843,7 +843,7 @@ def check_aggregate(self, variable, components=None, method='sum', return IamDataFrame(diff, variable=variable).timeseries() def aggregate_region(self, variable, region='World', subregions=None, - components=False, method='sum', weights=None, + components=False, method='sum', weight=None, append=False): """Compute the aggregate of timeseries over a number of regions including variable components only 
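A hedged usage sketch of the weighted region-aggregation added here, written with the `weight` spelling that this refactoring commit settles on; the data are illustrative and mirror the full-feature test fixture introduced a few commits below:

```python
import pandas as pd
from pyam import IamDataFrame

# hypothetical two-region example: carbon price and CO2 emissions
df = IamDataFrame(pd.DataFrame([
    ['model_a', 'scen_a', 'reg_a', 'Price|Carbon', 'USD/tCO2', 1, 30],
    ['model_a', 'scen_a', 'reg_b', 'Price|Carbon', 'USD/tCO2', 10, 21],
    ['model_a', 'scen_a', 'reg_a', 'Emissions|CO2', 'Mt CO2/yr', 6, 8],
    ['model_a', 'scen_a', 'reg_b', 'Emissions|CO2', 'Mt CO2/yr', 3, 4],
], columns=['model', 'scenario', 'region', 'variable', 'unit', 2005, 2010]))

# emissions-weighted mean of the regional carbon prices
df.aggregate_region('Price|Carbon', weight='Emissions|CO2')
# 2005: (1 * 6 + 10 * 3) / (6 + 3) = 4.0
# 2010: (30 * 8 + 21 * 4) / (8 + 4) = 27.0
```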
defined at the `region` level @@ -863,14 +863,14 @@ def aggregate_region(self, variable, region='World', subregions=None, or explicit list of variables method: func or str, default 'sum' method to use for aggregation, e.g. np.mean, np.sum, 'min', 'max' - weights: str, default None - variable to use as weights for the aggregation + weight: str, default None + variable to use as weight for the aggregation (currently only supported with `method='sum'`) append: bool, default False append the aggregate timeseries to `data` and return None, else return aggregate timeseries """ - if weights is not None and components is not False: + if weight is not None and components is not False: msg = 'using weights and components in one operation not supported' raise ValueError(msg) @@ -888,12 +888,12 @@ def aggregate_region(self, variable, region='World', subregions=None, subregion_df = self.filter(region=subregions) cols = ['region', 'variable'] rows = subregion_df._apply_filters(variable=variable) - if weights is None: + if weight is None: _data = _aggregate(subregion_df.data[rows], cols, method=method) else: - weight_rows = subregion_df._apply_filters(variable=weights) - _data = _aggregate_weights(subregion_df.data[rows], - subregion_df.data[weight_rows], method) + weight_rows = subregion_df._apply_filters(variable=weight) + _data = _aggregate_weight(subregion_df.data[rows], + subregion_df.data[weight_rows], method) # if not `components=False`, add components at the `region` level if components is not False: @@ -919,7 +919,7 @@ def aggregate_region(self, variable, region='World', subregions=None, return _data def check_aggregate_region(self, variable, region='World', subregions=None, - components=False, method='sum', weights=None, + components=False, method='sum', weight=None, exclude_on_fail=False, **kwargs): """Check whether the region timeseries data match the aggregation of components @@ -939,8 +939,8 @@ def check_aggregate_region(self, variable, region='World', subregions=None, or explicit list of variables method: func or str, default 'sum' method to use for aggregation, e.g. 
np.mean, np.sum, 'min', 'max' - weights: str, default None - variable to use as weights for the aggregation + weight: str, default None + variable to use as weight for the aggregation (currently only supported with `method='sum'`) exclude_on_fail: boolean, default False flag scenarios failing validation as `exclude: True` @@ -948,7 +948,7 @@ def check_aggregate_region(self, variable, region='World', subregions=None, """ # compute aggregate from subregions, return None if no subregions df_subregions = self.aggregate_region(variable, region, subregions, - components, method, weights) + components, method, weight) if df_subregions is None: return @@ -1490,14 +1490,14 @@ def _aggregate(df, by, method=np.sum): return df.groupby(cols)['value'].agg(_get_method_func(method)) -def _aggregate_weights(df, weights, method): +def _aggregate_weight(df, weight, method): """Aggregate `df` by regions with weights, return indexed `pd.Series`""" # only summation allowed with weights if method not in ['sum', np.sum]: raise ValueError('only method `np.sum` allowed for weighted average') _data = _get_value_col(df, YEAR_IDX) - _weight = _get_value_col(weights, YEAR_IDX) + _weight = _get_value_col(weight, YEAR_IDX) cols = META_IDX + ['year'] return (_data * _weight).groupby(cols).sum() / _weight.groupby(cols).sum() From 08ce7093a9419d23259bac54e9ede9591d3e59f0 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Mon, 16 Dec 2019 11:21:45 +0100 Subject: [PATCH 16/26] update docstrings (preparing for new test data) --- tests/conftest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 42e61c0a3..0d67d8a4d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -181,7 +181,7 @@ TEST_TIME_STR_HR = ['2005-06-17 00:00:00', '2010-07-21 12:00:00'] -# IamDataFrame with four different time formats +# minimal IamDataFrame with four different time formats @pytest.fixture( scope="function", params=[ @@ -198,14 +198,14 @@ def test_df(request): yield df -# IamDataFrame for testing specifically for 'year'-column feature +# minimal IamDataFrame for specifically testing 'year'-column features @pytest.fixture(scope="function") def test_df_year(): df = IamDataFrame(data=TEST_DF) yield df -# standard test data as pandas.DataFrame (only 'year' time format) +# minimal test data provided as pandas.DataFrame (only 'year' time format) @pytest.fixture(scope="function") def test_pd_df(): yield TEST_DF.copy() From 55f21c0c01c6425c6448ee59c2e81a5f6218594d Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Mon, 16 Dec 2019 22:31:56 +0100 Subject: [PATCH 17/26] add unit test for `check_aggregate_region()` --- tests/conftest.py | 33 +++++++++++++++++++++++++++++++++ tests/test_feature_aggregate.py | 24 ++++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 0d67d8a4d..ccf4bf2f9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -25,6 +25,32 @@ ) +FULL_FEATURE_DF = pd.DataFrame([ + ['World', 'Primary Energy', 'EJ/y', 10, 15], + ['reg_a', 'Primary Energy', 'EJ/y', 6, 9], + ['reg_b', 'Primary Energy', 'EJ/y', 4, 6], + ['World', 'Primary Energy|Coal', 'EJ/y', 7, 10], + ['reg_a', 'Primary Energy|Coal', 'EJ/y', 5, 7], + ['reg_b', 'Primary Energy|Coal', 'EJ/y', 2, 3], + ['World', 'Primary Energy|Wind', 'EJ/y', 3, 5], + ['reg_a', 'Primary Energy|Wind', 'EJ/y', 1, 2], + ['reg_b', 'Primary Energy|Wind', 'EJ/y', 2, 3], + ['World', 'Emissions|CO2', 'EJ/y', 10, 14], + ['World', 'Emissions|CO2|Bunkers', 'EJ/y', 1, 2], + 
['reg_a', 'Emissions|CO2', 'EJ/y', 6, 8], + ['reg_a', 'Emissions|CO2|Energy', 'EJ/y', 4, 5], + ['reg_a', 'Emissions|CO2|AFOLU', 'EJ/y', 2, 3], + ['reg_b', 'Emissions|CO2', 'EJ/y', 3, 4], + ['reg_b', 'Emissions|CO2|Energy', 'EJ/y', 2, 3], + ['reg_b', 'Emissions|CO2|AFOLU', 'EJ/y', 1, 1], + ['World', 'Price|Carbon', 'USD/tCO2', 4, 27], + ['reg_a', 'Price|Carbon', 'USD/tCO2', 1, 30], + ['reg_b', 'Price|Carbon', 'USD/tCO2', 10, 21], +], + columns=['region', 'variable', 'unit', 2005, 2010], +) + + REG_DF = pd.DataFrame([ ['IMAGE', 'a_scenario', 'NAF', 'Primary Energy', 'EJ/y', 1, 6], ['IMAGE', 'a_scenario', 'ME', 'Primary Energy', 'EJ/y', 2, 7], @@ -211,6 +237,13 @@ def test_pd_df(): yield TEST_DF.copy() +# IamDataFrame with variable-and-region-structure for testing aggregation tools +@pytest.fixture(scope="function") +def aggregate_df(): + df = IamDataFrame(model='model_a', scenario='scen_a', data=FULL_FEATURE_DF) + yield df + + @pytest.fixture(scope="function") def check_aggregate_df(): df = IamDataFrame(data=CHECK_AGG_DF) diff --git a/tests/test_feature_aggregate.py b/tests/test_feature_aggregate.py index 3fe92141e..cd6583398 100644 --- a/tests/test_feature_aggregate.py +++ b/tests/test_feature_aggregate.py @@ -9,6 +9,30 @@ from conftest import TEST_DTS +def test_aggregate_region(aggregate_df): + df = aggregate_df + + # Primary energy is a direct sum + assert df.check_aggregate_region('Primary Energy') is None + + # CO2 emissions have "bunkers" only defined at the region level + v = 'Emissions|CO2' + assert df.check_aggregate_region(v) is not None + assert df.check_aggregate_region(v, components=True) is None + + # rename emissions of bunker to test setting components as list + _df = df.rename(variable={'Emissions|CO2|Bunkers': 'foo'}) + assert _df.check_aggregate_region(v, components=['foo']) is None + + # Carbon price has to be weighted by emissions + assert df.check_aggregate_region('Price|Carbon') is not None + assert df.check_aggregate_region('Price|Carbon', weight=v) is None + + # setting both weight and components raises an error + pytest.raises(ValueError, df.aggregate_region, v, components=True, + weight='bar') + + def test_missing_region(check_aggregate_df): # for now, this test makes sure that this operation works as expected exp = check_aggregate_df.aggregate_region('Primary Energy', region='foo') From 14de79f7c1fbf6ac657d7bf5678d1ac069836750 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Mon, 16 Dec 2019 22:54:40 +0100 Subject: [PATCH 18/26] add test for `method` kwarg in `check_aggregate_region()` --- tests/test_feature_aggregate.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/tests/test_feature_aggregate.py b/tests/test_feature_aggregate.py index cd6583398..c0b2ad95a 100644 --- a/tests/test_feature_aggregate.py +++ b/tests/test_feature_aggregate.py @@ -12,7 +12,7 @@ def test_aggregate_region(aggregate_df): df = aggregate_df - # Primary energy is a direct sum + # primary energy is a direct sum assert df.check_aggregate_region('Primary Energy') is None # CO2 emissions have "bunkers" only defined at the region level @@ -24,7 +24,7 @@ def test_aggregate_region(aggregate_df): _df = df.rename(variable={'Emissions|CO2|Bunkers': 'foo'}) assert _df.check_aggregate_region(v, components=['foo']) is None - # Carbon price has to be weighted by emissions + # carbon price has to be weighted by emissions assert df.check_aggregate_region('Price|Carbon') is not None assert df.check_aggregate_region('Price|Carbon', weight=v) is None @@ -32,6 
+32,27 @@ def test_aggregate_region(aggregate_df): pytest.raises(ValueError, df.aggregate_region, v, components=True, weight='bar') + # use other method (max) both as string and passing the function + idx = ['model', 'scenario', 'unit', 'year'] + exp = pd.DataFrame([ + ['model_a', 'scen_a', 'USD/tCO2', 2005, 10.0], + ['model_a', 'scen_a', 'USD/tCO2', 2010, 30.0] + ], + columns=idx+['value'] + ).set_index(idx).value + obs = df.aggregate_region('Price|Carbon', method='max') + pd.testing.assert_series_equal(obs, exp) + + obs = df.aggregate_region('Price|Carbon', method=np.max) + pd.testing.assert_series_equal(obs, exp) + + # using illegal method raises an error + pytest.raises(ValueError, df.aggregate_region, v, method='foo') + + # using weight and method other than 'sum' raises an error + pytest.raises(ValueError, df.aggregate_region, v, method='max', + weight='bar') + def test_missing_region(check_aggregate_df): # for now, this test makes sure that this operation works as expected From a3901a1d396e2c48dd216e995929585da79259ff Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Tue, 17 Dec 2019 08:42:49 +0100 Subject: [PATCH 19/26] raise if variable & weight of inconsistent index in `aggregate_region()` --- pyam/core.py | 8 ++++++-- tests/test_feature_aggregate.py | 7 ++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index 65078ff07..6c9519a81 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -1499,6 +1499,9 @@ def _aggregate_weight(df, weight, method): _data = _get_value_col(df, YEAR_IDX) _weight = _get_value_col(weight, YEAR_IDX) + if not _data.index.equals(_weight.index): + raise ValueError('inconsistent index between variable and weight') + cols = META_IDX + ['year'] return (_data * _weight).groupby(cols).sum() / _weight.groupby(cols).sum() @@ -1516,8 +1519,9 @@ def _get_method_func(method): def _get_value_col(df, cols): - """Return the value column as `pd.Series with `cols` as index""" - return df.set_index(cols)['value'] + """Return the value column as `pd.Series sorted by index""" + return df.set_index(cols)['value'].sort_index() + def _raise_filter_error(col): raise ValueError('filter by `{}` not supported'.format(col)) diff --git a/tests/test_feature_aggregate.py b/tests/test_feature_aggregate.py index c0b2ad95a..d89743a80 100644 --- a/tests/test_feature_aggregate.py +++ b/tests/test_feature_aggregate.py @@ -24,10 +24,15 @@ def test_aggregate_region(aggregate_df): _df = df.rename(variable={'Emissions|CO2|Bunkers': 'foo'}) assert _df.check_aggregate_region(v, components=['foo']) is None - # carbon price has to be weighted by emissions + # carbon price shouldn't be summed but be weighted by emissions assert df.check_aggregate_region('Price|Carbon') is not None assert df.check_aggregate_region('Price|Carbon', weight=v) is None + # inconsistent index of variable and weight raises an error + _df = df.filter(variable='Emissions|CO2', region='reg_b', keep=False) + pytest.raises(ValueError, _df.aggregate_region, 'Price|Carbon', + weight='Emissions|CO2') + # setting both weight and components raises an error pytest.raises(ValueError, df.aggregate_region, v, components=True, weight='bar') From 9688aa14fe6a2a7d61c2ed0683fd26a295367bf4 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Tue, 17 Dec 2019 08:45:16 +0100 Subject: [PATCH 20/26] make full-agg-feature test data complete --- tests/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index ccf4bf2f9..9b11121ce 100644 --- 
a/tests/conftest.py +++ b/tests/conftest.py @@ -36,6 +36,8 @@ ['reg_a', 'Primary Energy|Wind', 'EJ/y', 1, 2], ['reg_b', 'Primary Energy|Wind', 'EJ/y', 2, 3], ['World', 'Emissions|CO2', 'EJ/y', 10, 14], + ['World', 'Emissions|CO2|Energy', 'EJ/y', 6, 8], + ['World', 'Emissions|CO2|AFOLU', 'EJ/y', 3, 4], ['World', 'Emissions|CO2|Bunkers', 'EJ/y', 1, 2], ['reg_a', 'Emissions|CO2', 'EJ/y', 6, 8], ['reg_a', 'Emissions|CO2|Energy', 'EJ/y', 4, 5], From 63a5d8a8221cf73db071b7abd0f7343bed666822 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Tue, 17 Dec 2019 09:01:41 +0100 Subject: [PATCH 21/26] add tests for `aggregate()` --- tests/test_feature_aggregate.py | 37 ++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/tests/test_feature_aggregate.py b/tests/test_feature_aggregate.py index d89743a80..c4d120117 100644 --- a/tests/test_feature_aggregate.py +++ b/tests/test_feature_aggregate.py @@ -9,10 +9,45 @@ from conftest import TEST_DTS +def test_aggregate(aggregate_df): + df = aggregate_df + + # primary energy is a direct sum (within each region) + assert df.check_aggregate('Primary Energy') is None + + # rename sub-category to test setting components as list + _df = df.rename(variable={'Primary Energy|Wind':'foo'}) + assert _df.check_aggregate('Primary Energy') is not None + components = ['Primary Energy|Coal', 'foo'] + assert _df.check_aggregate('Primary Energy', components=components) is None + + # use other method (max) both as string and passing the function + idx = ['model', 'scenario', 'region', 'unit', 'year'] + exp = pd.DataFrame([ + ['model_a', 'scen_a', 'World', 'EJ/y', 2005, 7.0], + ['model_a', 'scen_a', 'World', 'EJ/y', 2010, 10.0], + ['model_a', 'scen_a', 'reg_a', 'EJ/y', 2005, 5.0], + ['model_a', 'scen_a', 'reg_a', 'EJ/y', 2010, 7.0], + ['model_a', 'scen_a', 'reg_b', 'EJ/y', 2005, 2.0], + ['model_a', 'scen_a', 'reg_b', 'EJ/y', 2010, 3.0], + + ], + columns=idx+['value'] + ).set_index(idx).value + obs = df.aggregate('Primary Energy', method='max') + pd.testing.assert_series_equal(obs, exp) + + obs = df.aggregate('Primary Energy', method=np.max) + pd.testing.assert_series_equal(obs, exp) + + # using illegal method raises an error + pytest.raises(ValueError, df.aggregate, 'Primary Energy', method='foo') + + def test_aggregate_region(aggregate_df): df = aggregate_df - # primary energy is a direct sum + # primary energy is a direct sum (across regions) assert df.check_aggregate_region('Primary Energy') is None # CO2 emissions have "bunkers" only defined at the region level From ba71cc77590bee2452fc98dbca4cba874b8e767d Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Tue, 17 Dec 2019 09:23:40 +0100 Subject: [PATCH 22/26] appease stickler --- pyam/core.py | 2 +- tests/test_feature_aggregate.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index 6c9519a81..4f05dd664 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -1019,7 +1019,7 @@ def check_internal_consistency(self, **kwargs): inconsistent_vars[variable + "-aggregate"] = diff_agg diff_regional = self.check_aggregate_region(variable, - components=True, **kwargs) + components=True, **kwargs) if diff_regional is not None: inconsistent_vars[variable + "-regional"] = diff_regional diff --git a/tests/test_feature_aggregate.py b/tests/test_feature_aggregate.py index c4d120117..c894ef65d 100644 --- a/tests/test_feature_aggregate.py +++ b/tests/test_feature_aggregate.py @@ -3,7 +3,6 @@ import numpy as np import pandas as pd -import pytest 
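The guard added a few commits above (raising on inconsistent index of variable and weight) matters because the weighted mean relies on index-aligned pandas operations. A self-contained sketch of the underlying computation, sum(value * weight) / sum(weight) per model-scenario-year group, using plain pandas with the same illustrative numbers:

```python
import pandas as pd

# value and weight series indexed as returned by `_get_value_col()`
idx = pd.MultiIndex.from_product(
    [['model_a'], ['scen_a'], ['reg_a', 'reg_b'], [2005]],
    names=['model', 'scenario', 'region', 'year'])
price = pd.Series([1.0, 10.0], index=idx)   # e.g. 'Price|Carbon'
weight = pd.Series([6.0, 3.0], index=idx)   # e.g. 'Emissions|CO2'

# multiplication aligns on the full index, groupby collapses 'region'
cols = ['model', 'scenario', 'year']
obs = (price * weight).groupby(cols).sum() / weight.groupby(cols).sum()
# -> 4.0, i.e. (1 * 6 + 10 * 3) / (6 + 3)
```

If one region were missing from the weight series, the multiplication would yield NaN for the unmatched entries and the sums would silently ignore them, which is why `aggregate_region()` now raises a ValueError instead.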
from pyam import check_aggregate, IamDataFrame, IAMC_IDX from conftest import TEST_DTS @@ -16,7 +15,7 @@ def test_aggregate(aggregate_df): assert df.check_aggregate('Primary Energy') is None # rename sub-category to test setting components as list - _df = df.rename(variable={'Primary Energy|Wind':'foo'}) + _df = df.rename(variable={'Primary Energy|Wind': 'foo'}) assert _df.check_aggregate('Primary Energy') is not None components = ['Primary Energy|Coal', 'foo'] assert _df.check_aggregate('Primary Energy', components=components) is None @@ -32,7 +31,7 @@ def test_aggregate(aggregate_df): ['model_a', 'scen_a', 'reg_b', 'EJ/y', 2010, 3.0], ], - columns=idx+['value'] + columns=idx + ['value'] ).set_index(idx).value obs = df.aggregate('Primary Energy', method='max') pd.testing.assert_series_equal(obs, exp) @@ -78,7 +77,7 @@ def test_aggregate_region(aggregate_df): ['model_a', 'scen_a', 'USD/tCO2', 2005, 10.0], ['model_a', 'scen_a', 'USD/tCO2', 2010, 30.0] ], - columns=idx+['value'] + columns=idx + ['value'] ).set_index(idx).value obs = df.aggregate_region('Price|Carbon', method='max') pd.testing.assert_series_equal(obs, exp) @@ -214,11 +213,12 @@ def test_df_check_aggregate_pass(check_aggregate_df): def test_df_check_aggregate_region_pass(check_aggregate_df): - obs = check_aggregate_df.check_aggregate_region('Primary Energy', components=True) + comp = dict(components=True) + obs = check_aggregate_df.check_aggregate_region('Primary Energy', **comp) assert obs is None for variable in check_aggregate_df.variables(): - obs = check_aggregate_df.check_aggregate_region(variable, components=True) + obs = check_aggregate_df.check_aggregate_region(variable, **comp) assert obs is None From 2d50535ca391b4bf8942bc930a71cf64a7f8e5b6 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Tue, 17 Dec 2019 09:25:20 +0100 Subject: [PATCH 23/26] appease stickler again --- pyam/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyam/core.py b/pyam/core.py index 4f05dd664..8f69f3710 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -1019,7 +1019,7 @@ def check_internal_consistency(self, **kwargs): inconsistent_vars[variable + "-aggregate"] = diff_agg diff_regional = self.check_aggregate_region(variable, - components=True, **kwargs) + components=True, **kwargs) if diff_regional is not None: inconsistent_vars[variable + "-regional"] = diff_regional From c58200306001aa2f1e5ed85299082dab5dbe1f09 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Tue, 17 Dec 2019 09:26:46 +0100 Subject: [PATCH 24/26] third time stickler --- pyam/core.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index 8f69f3710..88a70057f 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -1018,8 +1018,10 @@ def check_internal_consistency(self, **kwargs): if diff_agg is not None: inconsistent_vars[variable + "-aggregate"] = diff_agg - diff_regional = self.check_aggregate_region(variable, - components=True, **kwargs) + diff_regional = ( + self.check_aggregate_region(variable, components=True, + **kwargs) + ) if diff_regional is not None: inconsistent_vars[variable + "-regional"] = diff_regional From ee630aac90e2e3de958d6e483063afb555357df7 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Mon, 23 Dec 2019 12:55:14 +0100 Subject: [PATCH 25/26] add `mean` to KNOWN_FUNCS (review comment by @gidden) --- pyam/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyam/utils.py b/pyam/utils.py index 9a30fbe3d..c0445ea0d 100644 --- a/pyam/utils.py +++ 
b/pyam/utils.py @@ -33,7 +33,8 @@ + ['{}{}'.format(i, j) for i, j in itertools.product( string.ascii_uppercase, string.ascii_uppercase)])) -KNOWN_FUNCS = {'min': np.min, 'max': np.max, 'avg': np.mean, 'sum': np.sum} +KNOWN_FUNCS = {'min': np.min, 'max': np.max, 'avg': np.mean, 'mean': np.mean, + 'sum': np.sum} def requires_package(pkg, msg, error_type=ImportError): From 01637b78d509a5226f1fa2d5b742afe121eb53da Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Mon, 23 Dec 2019 12:55:22 +0100 Subject: [PATCH 26/26] add to release notes --- RELEASE_NOTES.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 9e6f437a9..4cda6f27d 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,6 +1,14 @@ - # Next Release +## API changes + +PR [#305](https://github.com/IAMconsortium/pyam/pull/305) changed the default +behaviour of `aggregate_region()` regarding the treatment of components at the +region-level. To keep the previous behaviour, add `components=True`. + +## Individual Updates + +- [#305](https://github.com/IAMconsortium/pyam/pull/305) Add `method` and `weight` options to the (region) aggregation functions - [#302](https://github.com/IAMconsortium/pyam/pull/302) Rework the tutorials - [#301](https://github.com/IAMconsortium/pyam/pull/301) Bugfix when using `to_excel()` with a `pd.ExcelWriter` - [#297](https://github.com/IAMconsortium/pyam/pull/297) Add `empty` attribute, better error for `timeseries()` on empty dataframe @@ -8,6 +16,7 @@ - [#292](https://github.com/IAMconsortium/pyam/pull/292) Add warning message if `data` is empty at initialization (after formatting) - [#288](https://github.com/IAMconsortium/pyam/pull/288) Put `pyam` logger in its own namespace (see [here](https://docs.python-guide.org/writing/logging/#logging-in-a-library>)) - [#285](https://github.com/IAMconsortium/pyam/pull/285) Add ability to fetch regions with synonyms from IXMP API + # Release v0.3.0 ## Highlights
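To close out the series, a hedged migration sketch for the API change flagged in the release notes (assuming an `IamDataFrame` named `df` with component variables reported at the region level, as in the illustrative examples further up):

```python
# before this change: region-level components were picked up automatically
df.aggregate_region('Emissions|CO2')

# from this release on: opt in explicitly to keep the previous behaviour
df.aggregate_region('Emissions|CO2', components=True)

# the extended KNOWN_FUNCS mapping accepts 'mean' as an alias of np.mean
# (in addition to 'avg') for the `method` keyword
df.aggregate_region('Price|Carbon', method='mean')
```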