add method and weight feature to (region) aggregation #305

Merged

Changes from 22 commits (27 commits in total)
10cf203
use full `TEST_DF` for unit tests
danielhuppmann Dec 13, 2019
85afde4
use full `TEST_DF` for unit tests specific for 'year' feature
danielhuppmann Dec 13, 2019
08b894a
replace `meta_df` by `test_df` across all tests
danielhuppmann Dec 13, 2019
4636f90
appease stickler
danielhuppmann Dec 14, 2019
c22dca0
merge relevant changes from `peterkolp:region_aggregation_mip_feature`
danielhuppmann Dec 14, 2019
7ef3516
docstring clean-up
danielhuppmann Dec 14, 2019
9969c74
set `components=False` as default in `[check_]aggregate_region()`
danielhuppmann Dec 14, 2019
6dc3228
fix `method` docstring, add `weights` kwarg
danielhuppmann Dec 14, 2019
978829b
move internal function `_all_other_regions()`
danielhuppmann Dec 14, 2019
bf5813c
update docstring
danielhuppmann Dec 14, 2019
02adebc
speed-up of `aggregate_region` (no cloning of IamDataFrame)
danielhuppmann Dec 14, 2019
4a5e359
fix a kwarg default, add docstrings
danielhuppmann Dec 14, 2019
fa152e2
speed up `aggregate_region()` even more
danielhuppmann Dec 14, 2019
4e7fdc2
add feature to do weighted average over regions
danielhuppmann Dec 14, 2019
92b292c
refactor kwarg and auxiliary function to `weight`
danielhuppmann Dec 14, 2019
08ce709
update docstrings (preparing for new test data)
danielhuppmann Dec 16, 2019
55f21c0
add unit test for `check_aggregate_region()`
danielhuppmann Dec 16, 2019
14de79f
add test for `method` kwarg in `check_aggregate_region()`
danielhuppmann Dec 16, 2019
a3901a1
raise if variable & weight of inconsistent index in `aggregate_region()`
danielhuppmann Dec 17, 2019
9688aa1
make full-agg-feature test data complete
danielhuppmann Dec 17, 2019
63a5d8a
add tests for `aggregate()`
danielhuppmann Dec 17, 2019
ba71cc7
appease stickler
danielhuppmann Dec 17, 2019
2d50535
appease stickler again
danielhuppmann Dec 17, 2019
c582003
third time stickler
danielhuppmann Dec 17, 2019
b727cee
merge from `master`
danielhuppmann Dec 23, 2019
ee630aa
add `mean` to KNOWN_FUNCS (review comment by @gidden)
danielhuppmann Dec 23, 2019
01637b7
add to release notes
danielhuppmann Dec 23, 2019
2 changes: 1 addition & 1 deletion doc/source/tutorials/checking_databases.ipynb
@@ -4059,7 +4059,7 @@
"source": [
"for variable in consistent_df.filter(level=1).variables():\n",
" diff = consistent_df.check_aggregate_region(\n",
" variable, \n",
" variable, components=True,\n",
" **np_isclose_args\n",
" )\n",
" assert diff is None"
148 changes: 110 additions & 38 deletions pyam/core.py
100644 → 100755
@@ -38,6 +38,7 @@
YEAR_IDX,
IAMC_IDX,
SORT_IDX,
KNOWN_FUNCS
)
from pyam.read_ixmp import read_ix
from pyam.timeseries import fill_series
@@ -766,7 +767,7 @@ def normalize(self, inplace=False, **kwargs):
if not inplace:
return ret

def aggregate(self, variable, components=None, append=False):
def aggregate(self, variable, components=None, method='sum', append=False):
"""Compute the aggregate of timeseries components or sub-categories

Parameters
@@ -775,6 +776,8 @@ def aggregate(self, variable, components=None, append=False):
variable for which the aggregate should be computed
components: list of str, default None
list of variables, defaults to all sub-categories of `variable`
method: func or str, default 'sum'
method to use for aggregation, e.g. np.mean, np.sum, 'min', 'max'
append: bool, default False
append the aggregate timeseries to `data` and return None,
else return aggregate timeseries
@@ -789,15 +792,15 @@ def aggregate(self, variable, components=None, append=False):
return

rows = self._apply_filters(variable=components)
_data = _aggregate(self.data[rows], 'variable')
_data = _aggregate(self.data[rows], 'variable', method)

if append is True:
self.append(_data, variable=variable, inplace=True)
else:
return _data
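
A minimal usage sketch of the new `method` kwarg (illustrative only; the data and values below are assumptions, not part of this diff):

import numpy as np
import pandas as pd
from pyam import IamDataFrame

# toy data: a parent variable and two sub-categories in one region
df = IamDataFrame(pd.DataFrame(
    [['model_a', 'scen_a', 'World', 'Primary Energy', 'EJ/y', 10, 15],
     ['model_a', 'scen_a', 'World', 'Primary Energy|Coal', 'EJ/y', 7, 10],
     ['model_a', 'scen_a', 'World', 'Primary Energy|Wind', 'EJ/y', 3, 5]],
    columns=['model', 'scenario', 'region', 'variable', 'unit', 2005, 2010]))

pe_sum = df.aggregate('Primary Energy')                 # default 'sum': 10, 15
pe_max = df.aggregate('Primary Energy', method='max')   # string alias: 7, 10
pe_min = df.aggregate('Primary Energy', method=np.min)  # or any callable: 3, 5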

def check_aggregate(self, variable, components=None, exclude_on_fail=False,
multiplier=1, **kwargs):
def check_aggregate(self, variable, components=None, method='sum',
exclude_on_fail=False, multiplier=1, **kwargs):
"""Check whether a timeseries matches the aggregation of its components

Parameters
Expand All @@ -806,6 +809,8 @@ def check_aggregate(self, variable, components=None, exclude_on_fail=False,
variable to be checked for matching aggregation of sub-categories
components: list of str, default None
list of variables, defaults to all sub-categories of `variable`
method: func or str, default 'sum'
method to use for aggregation, e.g. np.mean, np.sum, 'min', 'max'
exclude_on_fail: boolean, default False
flag scenarios failing validation as `exclude: True`
multiplier: number, default 1
@@ -820,7 +825,8 @@ def check_aggregate(self, variable, components=None, exclude_on_fail=False,
# filter and groupby data, use `pd.Series.align` for matching index
rows = self._apply_filters(variable=variable)
df_variable, df_components = (
_aggregate(self.data[rows], 'variable').align(df_components)
_aggregate(self.data[rows], 'variable', method)
.align(df_components)
)

# use `np.isclose` for checking match
@@ -837,7 +843,8 @@ def check_aggregate(self, variable, components=None, exclude_on_fail=False,
return IamDataFrame(diff, variable=variable).timeseries()
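
A companion sketch for the check, reusing the toy `df` from the sketch above (`check_aggregate` returns None when the timeseries matches the aggregate of its components):

# 'Primary Energy' equals Coal + Wind in the toy data, so the check passes
assert df.check_aggregate('Primary Energy', method='sum') is None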

def aggregate_region(self, variable, region='World', subregions=None,
components=None, append=False):
components=False, method='sum', weight=None,
append=False):
"""Compute the aggregate of timeseries over a number of regions
including variable components only defined at the `region` level

@@ -849,18 +856,26 @@ def aggregate_region(self, variable, region='World', subregions=None,
dimension
subregions: list of str
list of subregions, defaults to all regions other than `region`
components: list of str
list of variables to include in the aggregate from the `region`
level, defaults to all sub-categories of `variable` included in
`region` but not in any of `subregions`
components: bool or list of str, default False
variables at the `region` level to be included in the aggregation
(ignored if False); if `True`, use all sub-categories of `variable`
included in `region` but not in any of the `subregions`;
or explicit list of variables
method: func or str, default 'sum'
method to use for aggregation, e.g. np.mean, np.sum, 'min', 'max'
weight: str, default None
variable to use as weight for the aggregation
(currently only supported with `method='sum'`)
append: bool, default False
append the aggregate timeseries to `data` and return None,
else return aggregate timeseries
"""
if weight is not None and components is not False:
msg = 'using weights and components in one operation not supported'
raise ValueError(msg)

# default subregions to all regions other than `region`
if subregions is None:
rows = self._apply_filters(variable=variable)
subregions = set(self.data[rows].region) - set([region])
subregions = subregions or self._all_other_regions(region, variable)

if not len(subregions):
msg = 'cannot aggregate variable `{}` to `{}` because it does not'\
@@ -872,30 +887,40 @@ def aggregate_region(self, variable, region='World', subregions=None,
# compute aggregate over all subregions
subregion_df = self.filter(region=subregions)
cols = ['region', 'variable']
_data = _aggregate(subregion_df.filter(variable=variable).data, cols)

# add components at the `region` level, defaults to all variables one
# level below `variable` that are only present in `region`
with adjust_log_level(logger):
region_df = self.filter(region=region)

rdf_comps = region_df._variable_components(variable, level=None)
srdf_comps = subregion_df._variable_components(variable, level=None)
components = components or set(rdf_comps).difference(srdf_comps)

if len(components):
rows = region_df._apply_filters(variable=components)
_data = _data.add(_aggregate(region_df.data[rows], cols),
fill_value=0)
rows = subregion_df._apply_filters(variable=variable)
if weight is None:
_data = _aggregate(subregion_df.data[rows], cols, method=method)
else:
weight_rows = subregion_df._apply_filters(variable=weight)
_data = _aggregate_weight(subregion_df.data[rows],
subregion_df.data[weight_rows], method)

# if not `components=False`, add components at the `region` level
if components is not False:
with adjust_log_level(logger):
region_df = self.filter(region=region)

# if `True`, auto-detect `components` at the `region` level,
# defaults to variables below `variable` only present in `region`
if components is True:
level = dict(level=None)
r_comps = region_df._variable_components(variable, **level)
sr_comps = subregion_df._variable_components(variable, **level)
components = set(r_comps).difference(sr_comps)

if len(components):
rows = region_df._apply_filters(variable=components)
_data = _data.add(_aggregate(region_df.data[rows], cols),
fill_value=0)

if append is True:
self.append(_data, region=region, variable=variable, inplace=True)
else:
return _data
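
A sketch of the new weighted aggregation, assuming an IamDataFrame `df` built from the FULL_FEATURE_DF test data added in this PR (see tests/conftest.py below); note that `weight` requires `method='sum'` and `components=False`:

# emissions-weighted average of the carbon price over all subregions
price = df.aggregate_region('Price|Carbon', weight='Emissions|CO2')

# combining `weight` with `components` raises a ValueError:
# df.aggregate_region('Price|Carbon', weight='Emissions|CO2', components=True)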

def check_aggregate_region(self, variable, region='World', subregions=None,
components=None, exclude_on_fail=False,
**kwargs):
components=False, method='sum', weight=None,
exclude_on_fail=False, **kwargs):
"""Check whether the region timeseries data match the aggregation
of components

@@ -907,16 +932,23 @@ def check_aggregate_region(self, variable, region='World', subregions=None,
region to be checked for matching aggregation of subregions
subregions: list of str
list of subregions, defaults to all regions other than `region`
components: list of str, default None
list of variables, defaults to all sub-categories of `variable`
included in `region` but not in any of `subregions`
components: bool or list of str, default False
variables at the `region` level to be included in the aggregation
(ignored if False); if `True`, use all sub-categories of `variable`
included in `region` but not in any of the `subregions`;
or explicit list of variables
method: func or str, default 'sum'
method to use for aggregation, e.g. np.mean, np.sum, 'min', 'max'
weight: str, default None
variable to use as weight for the aggregation
(currently only supported with `method='sum'`)
exclude_on_fail: boolean, default False
flag scenarios failing validation as `exclude: True`
kwargs: passed to `np.isclose()`
"""
# compute aggregate from subregions, return None if no subregions
df_subregions = self.aggregate_region(variable, region, subregions,
components)
components, method, weight)
if df_subregions is None:
return

@@ -947,6 +979,11 @@ def check_aggregate_region(self, variable, region='World', subregions=None,
col_args = dict(region=region, variable=variable)
return IamDataFrame(diff, **col_args).timeseries()
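
The analogous check, under the same assumptions as the sketch above (returns None when the `region` timeseries matches the weighted aggregate of the subregions):

assert df.check_aggregate_region('Price|Carbon',
                                 weight='Emissions|CO2') is None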

def _all_other_regions(self, region, variable):
"""Return list of regions other than `region` containing `variable`"""
rows = self._apply_filters(variable=variable)
return set(self.data[rows].region) - set([region])

def _variable_components(self, variable, level=0):
"""Get all components (sub-categories) of a variable for a given level

@@ -958,7 +995,7 @@ def _variable_components(self, variable, level=0):
level=level)]

def check_internal_consistency(self, **kwargs):
"""Check whether the database is internally consistent
"""Check whether a scenario ensemble is internally consistent

We check that all variables are equal to the sum of their sectoral
components and that all the regions add up to the World total. If
@@ -981,7 +1018,8 @@ def check_internal_consistency(self, **kwargs):
if diff_agg is not None:
inconsistent_vars[variable + "-aggregate"] = diff_agg

diff_regional = self.check_aggregate_region(variable, **kwargs)
diff_regional = self.check_aggregate_region(variable,
components=True, **kwargs)
danielhuppmann marked this conversation as resolved.
if diff_regional is not None:
inconsistent_vars[variable + "-regional"] = diff_regional

@@ -1444,11 +1482,45 @@ def _meta_idx(data):
return data[META_IDX].drop_duplicates().set_index(META_IDX).index


def _aggregate(df, by):
def _aggregate(df, by, method=np.sum):
"""Aggregate `df` by specified column(s), return indexed `pd.Series`"""
by = [by] if isstr(by) else by
cols = [c for c in list(df.columns) if c not in ['value'] + by]
return df.groupby(cols).sum()['value']
# pick aggregator func (default: sum)
return df.groupby(cols)['value'].agg(_get_method_func(method))


def _aggregate_weight(df, weight, method):
Collaborator: If this is for weighted average, then surely `method` is always `sum`/`np.sum`? Or is this just meant as a placeholder for future methods?
Consider also using `np.nansum` instead, so that the calculation won't return `nan` if a value is missing.

Member Author: 1. The `method` arg partly anticipates more supported functions in the future, and partly lets this function check that the chosen method is indeed `sum` (moving that check to the lower-level function also allows reusing `_aggregate_weight()` in other functions later without re-implementing the check in multiple higher-level functions).
2. The data table cannot have `nan` (they are removed at initialisation), so this point is moot. If a data value or the weight value is missing (i.e., inconsistent series indexes), a `ValueError` is raised.

"""Aggregate `df` by regions with weights, return indexed `pd.Series`"""
# only summation allowed with weights
if method not in ['sum', np.sum]:
raise ValueError('only method `np.sum` allowed for weighted average')

_data = _get_value_col(df, YEAR_IDX)
_weight = _get_value_col(weight, YEAR_IDX)

if not _data.index.equals(_weight.index):
raise ValueError('inconsistent index between variable and weight')

cols = META_IDX + ['year']
return (_data * _weight).groupby(cols).sum() / _weight.groupby(cols).sum()

Collaborator: Would you put `.agg(method)` here instead of `.sum()`?

Collaborator: Do you also want to limit inputs to KNOWN_FUNCS?

Member Author: Only summation is allowed (for the time being) anyway, so imho no need to be more complicated...

Member: Agree that we don't need anything more complicated here, but maybe worth putting in some comments about what would need to be changed in the future if more than `sum` is supported?
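
A minimal standalone sketch of the formula above, plugging in the 2005 values from the FULL_FEATURE_DF test data added in this PR (Price|Carbon: reg_a=1, reg_b=10; weight Emissions|CO2: reg_a=6, reg_b=3):

import pandas as pd

data = pd.Series({'reg_a': 1.0, 'reg_b': 10.0})
weight = pd.Series({'reg_a': 6.0, 'reg_b': 3.0})

# (1 * 6 + 10 * 3) / (6 + 3) = 36 / 9 = 4, matching 'World' Price|Carbon in 2005
assert (data * weight).sum() / weight.sum() == 4.0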



def _get_method_func(method):
"""Translate a string to a known method"""
if not isstr(method):
return method

if method in KNOWN_FUNCS:
return KNOWN_FUNCS[method]

# raise error if `method` is a string but not in dict of known methods
raise ValueError('method `{}` is not a known aggregator'.format(method))
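
A quick sketch of how the lookup resolves (within the module context above):

import numpy as np

assert _get_method_func('max') is np.max      # string alias via KNOWN_FUNCS
assert _get_method_func(np.mean) is np.mean   # callables pass through unchanged
# _get_method_func('median') raises ValueError (not in KNOWN_FUNCS)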


def _get_value_col(df, cols):
"""Return the value column as `pd.Series sorted by index"""
return df.set_index(cols)['value'].sort_index()


def _raise_filter_error(col):
2 changes: 2 additions & 0 deletions pyam/utils.py
@@ -33,6 +33,8 @@
+ ['{}{}'.format(i, j) for i, j in itertools.product(
string.ascii_uppercase, string.ascii_uppercase)]))

KNOWN_FUNCS = {'min': np.min, 'max': np.max, 'avg': np.mean, 'sum': np.sum}

Member: Maybe add a `mean` synonym as well?

Member Author: done
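
Per this thread and commit ee630aa ("add `mean` to KNOWN_FUNCS"), the mapping presumably becomes:

KNOWN_FUNCS = {'min': np.min, 'max': np.max,
               'avg': np.mean, 'mean': np.mean, 'sum': np.sum}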



def requires_package(pkg, msg, error_type=ImportError):
"""Decorator when a function requires an optional dependency
59 changes: 42 additions & 17 deletions tests/conftest.py
@@ -25,6 +25,34 @@
)


FULL_FEATURE_DF = pd.DataFrame([
['World', 'Primary Energy', 'EJ/y', 10, 15],
['reg_a', 'Primary Energy', 'EJ/y', 6, 9],
['reg_b', 'Primary Energy', 'EJ/y', 4, 6],
['World', 'Primary Energy|Coal', 'EJ/y', 7, 10],
['reg_a', 'Primary Energy|Coal', 'EJ/y', 5, 7],
['reg_b', 'Primary Energy|Coal', 'EJ/y', 2, 3],
['World', 'Primary Energy|Wind', 'EJ/y', 3, 5],
['reg_a', 'Primary Energy|Wind', 'EJ/y', 1, 2],
['reg_b', 'Primary Energy|Wind', 'EJ/y', 2, 3],
['World', 'Emissions|CO2', 'EJ/y', 10, 14],
['World', 'Emissions|CO2|Energy', 'EJ/y', 6, 8],
['World', 'Emissions|CO2|AFOLU', 'EJ/y', 3, 4],
['World', 'Emissions|CO2|Bunkers', 'EJ/y', 1, 2],
['reg_a', 'Emissions|CO2', 'EJ/y', 6, 8],
['reg_a', 'Emissions|CO2|Energy', 'EJ/y', 4, 5],
['reg_a', 'Emissions|CO2|AFOLU', 'EJ/y', 2, 3],
['reg_b', 'Emissions|CO2', 'EJ/y', 3, 4],
['reg_b', 'Emissions|CO2|Energy', 'EJ/y', 2, 3],
['reg_b', 'Emissions|CO2|AFOLU', 'EJ/y', 1, 1],
['World', 'Price|Carbon', 'USD/tCO2', 4, 27],
['reg_a', 'Price|Carbon', 'USD/tCO2', 1, 30],
['reg_b', 'Price|Carbon', 'USD/tCO2', 10, 21],
],
columns=['region', 'variable', 'unit', 2005, 2010],
)


REG_DF = pd.DataFrame([
['IMAGE', 'a_scenario', 'NAF', 'Primary Energy', 'EJ/y', 1, 6],
['IMAGE', 'a_scenario', 'ME', 'Primary Energy', 'EJ/y', 2, 7],
@@ -177,47 +205,44 @@

TEST_YEARS = [2005, 2010]
TEST_DTS = [datetime(2005, 6, 17), datetime(2010, 7, 21)]
TEST_TIME_STR = ['2005-06-17', '2010-07-21']
TEST_TIME_STR_HR = ['2005-06-17 00:00:00', '2010-07-21 12:00:00']


# minimal IamDataFrame with four different time formats
@pytest.fixture(
scope="function",
params=[
TEST_YEARS,
TEST_DTS,
['2005-06-17', '2010-07-21'],
['2005-06-17 00:00:00', '2010-07-21 12:00:00']
TEST_TIME_STR,
TEST_TIME_STR_HR
]
)
def test_df(request):
tdf = TEST_DF.iloc[:2]
tdf = tdf.rename({2005: request.param[0], 2010: request.param[1]},
axis="columns")
tdf = TEST_DF.rename({2005: request.param[0], 2010: request.param[1]},
axis="columns")
df = IamDataFrame(data=tdf)
yield df


# minimal IamDataFrame for specifically testing 'year'-column features
@pytest.fixture(scope="function")
def test_df_year():
df = IamDataFrame(data=TEST_DF.iloc[:2])
df = IamDataFrame(data=TEST_DF)
yield df


# minimal test data provided as pandas.DataFrame (only 'year' time format)
@pytest.fixture(scope="function")
def test_pd_df():
yield TEST_DF.copy()


@pytest.fixture(
scope="function",
params=[
TEST_YEARS,
TEST_DTS,
]
)
def meta_df(request):
mdf = TEST_DF.rename({2005: request.param[0], 2010: request.param[1]},
axis="columns")
df = IamDataFrame(data=mdf)
# IamDataFrame with variable-and-region-structure for testing aggregation tools
@pytest.fixture(scope="function")
def aggregate_df():
df = IamDataFrame(model='model_a', scenario='scen_a', data=FULL_FEATURE_DF)
yield df
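
A hypothetical unit test built on this fixture (a sketch only; the exact assertions in the PR's test suite may differ):

def test_aggregate_region_weight(aggregate_df):
    # emissions-weighted carbon price: (1*6 + 10*3)/9 = 4 in 2005
    # and (30*8 + 21*4)/12 = 27 in 2010, cf. FULL_FEATURE_DF above
    obs = aggregate_df.aggregate_region('Price|Carbon', weight='Emissions|CO2')
    assert list(obs) == [4, 27]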

