Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backwards compatibility of test data with pymatgen #206

Merged
merged 6 commits into from
Apr 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 17 additions & 4 deletions modnet/featurizers/featurizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,10 @@ def _fit_apply_featurizers(
_featurizers.set_n_jobs(self._n_jobs)

return _featurizers.featurize_dataframe(
df, column, multiindex=True, ignore_errors=True
df,
column,
multiindex=True,
ignore_errors=getattr(self, "ignore_errors", True),
)
elif mode == "single":

Expand All @@ -164,7 +167,10 @@ def _fit_apply_featurizers(
)
start = time.monotonic_ns()
df = featurizer.featurize_dataframe(
df, column, multiindex=True, ignore_errors=True
df,
column,
multiindex=True,
ignore_errors=getattr(self, "ignore_errors", True),
)
LOG.info(
f"Applied featurizer {featurizer.__class__.__name__} to column {column!r} in {(time.monotonic_ns() - start) * 1e-9} seconds"
Expand Down Expand Up @@ -244,7 +250,11 @@ def featurize_composition(self, df: pd.DataFrame) -> pd.DataFrame:
else:
df = CompositionToOxidComposition(
max_sites=-1 if getattr(self, "continuous_only", False) else None
).featurize_dataframe(df, col_id=col_comp, ignore_errors=True)
).featurize_dataframe(
df,
col_id=col_comp,
ignore_errors=getattr(self, "ignore_errors", True),
)
df = self._fit_apply_featurizers(
df,
self.oxid_composition_featurizers,
Expand Down Expand Up @@ -311,7 +321,10 @@ def featurize_site(
fingerprint, stats=self.site_stats
)
df = site_stats_fingerprint.featurize_dataframe(
df, "Input data|structure", multiindex=False, ignore_errors=True
df,
"Input data|structure",
multiindex=False,
ignore_errors=getattr(self, "ignore_errors", True),
)

if aliases:
Expand Down
21 changes: 20 additions & 1 deletion modnet/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import pytest
from pathlib import Path
from modnet.preprocessing import CompositionContainer

from modnet.utils import get_hash_of_file
from pymatgen.core import Structure


_TEST_DATA_HASHES = {
Expand Down Expand Up @@ -41,7 +43,24 @@ def _load_moddata(filename):
# what it was when created
assert get_hash_of_file(data_file) == _TEST_DATA_HASHES[filename]

return MODData.load(data_file)
moddata = MODData.load(data_file)
# For forwards compatibility with newer pymatgen versions, we have to patch our old (pickled) test data
# after loading so that the unpickled objects carry the attributes set below.
# This is hopefully only a temporary solution; in future, we should serialize pymatgen objects
# with Monty's `as_dict`/`from_dict` to avoid having to hack this private interface
for ind, s in enumerate(moddata.structures):
if isinstance(s, Structure):
# assume all previous data was periodic
moddata.structures[ind].lattice._pbc = [True, True, True]
for jnd, site in enumerate(s.sites):
# assume all of our previous data had ordered sites
moddata.structures[ind].sites[jnd].label = str(next(iter(site.species)))
# required for the global structure.is_ordered to work
moddata.structures[ind].sites[jnd].species._n_atoms = 1.0
elif isinstance(s, CompositionContainer):
moddata.structures[ind].composition._n_atoms = s.composition._natoms

return moddata


@pytest.fixture(scope="function")
Expand Down
23 changes: 7 additions & 16 deletions modnet/tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,14 @@ def check_column_values(new: MODData, reference: MODData, tolerance=0.03):
Allows for some columns to be checked more loosely (see inline comment below).

"""
new_cols = set(new.df_featurized.columns)
old_cols = set(reference.df_featurized.columns)

# Check that the new df only adds new columns and is not missing anything
assert not (old_cols - new_cols)

error_cols = set()
for col in new.df_featurized.columns:
for col in old_cols:
if not (
np.absolute(
(
Expand Down Expand Up @@ -349,14 +355,6 @@ def test_small_moddata_featurization(small_moddata_2023, featurizer_mode):
featurizer.featurizer_mode = featurizer_mode
new = MODData(structures, targets, target_names=names, featurizer=featurizer)
new.featurize(fast=False, n_jobs=1)

new_cols = sorted(new.df_featurized.columns.tolist())
old_cols = sorted(old.df_featurized.columns.tolist())

for i in range(len(old_cols)):
assert new_cols[i] == old_cols[i]

np.testing.assert_array_equal(old_cols, new_cols)
check_column_values(new, old, tolerance=0.03)


Expand All @@ -376,13 +374,6 @@ def test_small_moddata_composition_featurization(
new = MODData(materials=compositions, featurizer=featurizer)
new.featurize(fast=False, n_jobs=1)

new_cols = sorted(new.df_featurized.columns.tolist())
ref_cols = sorted(reference.df_featurized.columns.tolist())

for i in range(len(ref_cols)):
# print(new_cols[i], ref_cols[i])
assert new_cols[i] == ref_cols[i]

# assert relative error below 3 percent
check_column_values(new, reference, tolerance=0.03)

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ pandas==1.5.2
scikit-learn==1.3.2
matminer==0.9.2
numpy>=1.25
pymatgen==2023.11.12
pymatgen==2024.3.1
scikit-learn==1.3.2
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
packages=setuptools.find_packages(),
install_requires=[
"pandas~=1.5",
"tensorflow~=2.10",
"tensorflow~=2.10,<2.12",
"pymatgen>=2023",
"matminer~=0.9",
"numpy>=1.24",
Expand Down
Loading