Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add ability to read rcp db data #70

Merged
merged 5 commits into from
Jul 17, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@
# Next Release

- [#73](https://github.com/IAMconsortium/pyam/pull/73) Adds ability to remove labels for markers, colors, or linestyles
- [#70](https://github.com/IAMconsortium/pyam/pull/70) Support reading of both SSP and RCP data files downloaded from the IIASA database.
- [#66](https://github.com/IAMconsortium/pyam/pull/66) Fixes a bug in the `interpolate()` function (duplication of data points if already defined)
- [#65](https://github.com/IAMconsortium/pyam/pull/65) Add a `filter_by_meta()` function to filter/join a pd.DataFrame with an IamDataFrame.meta table
23 changes: 14 additions & 9 deletions pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,12 @@ def __init__(self, data, **kwargs):
----------
data: ixmp.TimeSeries, ixmp.Scenario, pd.DataFrame or data file
an instance of a TimeSeries or Scenario (requires `ixmp`),
or pd.DataFrame or data file with IAMC-format data columns
or pd.DataFrame or data file with IAMC-format data columns.

Special support is provided for data files downloaded directly from
IIASA SSP and RCP databases. If you run into any problems loading
data, please open an issue at:
https://github.com/IAMconsortium/pyam/issues
"""
# import data from pd.DataFrame or read from source
if isinstance(data, pd.DataFrame):
Expand Down Expand Up @@ -298,11 +303,11 @@ def set_meta(self, meta, name=None, index=None):

# reduce index dimensions to model-scenario only
_meta = (
_meta
.reset_index()
.reindex(columns=META_IDX + [name])
.set_index(META_IDX)
)
_meta
.reset_index()
.reindex(columns=META_IDX + [name])
.set_index(META_IDX)
)

# raise error if index is not unique
if _meta.index.duplicated().any():
Expand Down Expand Up @@ -499,9 +504,9 @@ def check_aggregate(self, variable, components=None, units=None,

# filter and groupby data, use `pd.Series.align` for matching index
df_variable, df_components = (
_aggregate_by_variables(self.data, variable, units)
.align(_aggregate_by_variables(self.data, components, units))
)
_aggregate_by_variables(self.data, variable, units)
.align(_aggregate_by_variables(self.data, components, units))
)

# use `np.isclose` for checking match
diff = df_variable[~np.isclose(df_variable, multiplier * df_components,
Expand Down
23 changes: 16 additions & 7 deletions pyam/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,25 +146,34 @@ def read_files(fnames, *args, **kwargs):
for fname in fnames:
logger().info('Reading `{}`'.format(fname))
df = read_pandas(fname, *args, **kwargs)
dfs.append(format_data(df))
df = format_data(df)
dfs.append(df)

return pd.concat(dfs)


def format_data(df):
"""Convert an imported dataframe and check all required columns"""
# all lower case
df.rename(columns={c: str(c).lower() for c in df.columns}, inplace=True)

if 'notes' in df.columns: # this came from the database
logger().info('Ignoring notes column in dataframe')
df.drop(columns='notes', inplace=True)
col = df.columns[0] # first column has database copyright notice
df = df[~df[col].str.contains('database', case=False)]
if 'scenario' in df.columns and 'model' not in df.columns:
# model and scenario are jammed together in RCP data
scen = df['scenario']
df['model'] = scen.apply(lambda s: s.split('-')[0].strip())
df['scenario'] = scen.apply(
lambda s: '-'.join(s.split('-')[1:]).strip())

# format columns to lower-case and check that all required columns exist
df.rename(columns={c: str(c).lower() for c in df.columns}, inplace=True)
if not set(IAMC_IDX).issubset(set(df.columns)):
missing = list(set(IAMC_IDX) - set(df.columns))
raise ValueError("missing required columns `{}`!".format(missing))

if 'notes' in df.columns:
logger().info('Ignoring notes column in dataframe')
df.drop(columns='notes', inplace=True)
df = df[~df.model.str.contains('database', case=False)]

# check whether data in IAMC style or year/value layout
if 'value' not in df.columns:
numcols = sorted(set(df.columns) - set(IAMC_IDX))
Expand Down
Binary file added tests/data/test_RCP_database_raw_download.xlsx
Binary file not shown.
12 changes: 10 additions & 2 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,8 @@ def test_check_aggregate_pass(meta_df):
'region': 'World', 'variable': 'Primary Energy|Gas',
'unit': 'EJ/y', 'year': [2005, 2010], 'value': [.5, 3]})
meta_df.data = meta_df.data.append(df, ignore_index=True)
obs = meta_df.filter(scenario='a_scenario').check_aggregate('Primary Energy')
obs = meta_df.filter(
scenario='a_scenario').check_aggregate('Primary Energy')
assert obs is None


Expand All @@ -258,7 +259,7 @@ def test_category_pass(meta_df):
exp = pd.DataFrame(dct).set_index(['model', 'scenario'])['category']

meta_df.categorize('category', 'foo', {'Primary Energy':
{'up': 6, 'year': 2010}})
{'up': 6, 'year': 2010}})
obs = meta_df['category']
pd.testing.assert_series_equal(obs, exp)

Expand Down Expand Up @@ -294,6 +295,13 @@ def test_load_SSP_database_downloaded_file(test_df):
pd.testing.assert_frame_equal(obs_df.as_pandas(), test_df.as_pandas())


def test_load_RCP_database_downloaded_file(test_df):
    """Load the raw RCP database download and compare it to `test_df`."""
    rcp_file = os.path.join(
        TEST_DATA_DIR, 'test_RCP_database_raw_download.xlsx')
    loaded = IamDataFrame(rcp_file)
    # the data read from the raw download must equal the reference frame
    pd.testing.assert_frame_equal(loaded.as_pandas(), test_df.as_pandas())


def test_append(test_df):
df2 = test_df.append(other=os.path.join(
TEST_DATA_DIR, 'testing_data_2.csv'))
Expand Down