MG45 - ADT: Pathology data ETL (#149)
* Generalized the biomarkers transform so it can work for the pathology dataset as well
* Updated test file names and functions to reflect the biomarkers to modelAD_general_transform change
* Added the pathology dataset to modelad_test_config.yaml
* Updated function names and descriptions to the generalized transform name
* Addressed PR comments

Co-authored-by: Beatriz Saldana <bsaldana@w262.lan>
1 parent e02c29b, commit 105ddc9
Showing 18 changed files with 158 additions and 123 deletions.
This file was deleted.
@@ -0,0 +1,52 @@
"""
This module contains the transformation logic for the biomarkers and pathology datasets.
This is for the Model AD project.
"""

import pandas as pd
from typing import Dict, List


def immunohisto_transform(
    datasets: Dict[str, pd.DataFrame],
    dataset_name: str,
    group_columns: List[str] = ["model", "type", "age_death", "tissue", "units"],
    extra_columns: List[str] = ["genotype", "measurement", "sex"],
    extra_column_name: str = "points",
) -> pd.DataFrame:
    """
    Takes a dictionary of dataset DataFrames, extracts the 'dataset_name'
    DataFrame, and transforms it into a DataFrame grouped by group_columns.
    Will include extra_columns in the group.
    Args:
        datasets (Dict[str, pd.DataFrame]): Dictionary of dataset names mapped to their DataFrame.
        dataset_name (str): The name of the dataset to transform.
        group_columns (List[str], optional): List of columns to group by. Defaults to ['model', 'type', 'age_death', 'tissue', 'units'].
        extra_columns (List[str], optional): List of columns to include in the group. Defaults to ['genotype', 'measurement', 'sex'].
        extra_column_name (str, optional): Name of the column containing the extra columns. Defaults to 'points'.
    Returns:
        pd.DataFrame: A DataFrame grouped by the group_columns.
    """
    dataset = datasets[dataset_name]

    missing_columns = [
        col for col in group_columns + extra_columns if col not in dataset.columns
    ]
    if missing_columns:
        raise ValueError(
            f"{dataset_name} dataset missing columns: {', '.join(missing_columns)}"
        )

    dataset = dataset.fillna("none")
    data_rows = []

    grouped = dataset.groupby(group_columns)

    for group_key, group in grouped:
        entry = dict(zip(group_columns, group_key))
        entry[extra_column_name] = group[extra_columns].to_dict("records")
        data_rows.append(entry)

    return pd.DataFrame(data_rows)
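For context only (not part of the commit): a minimal usage sketch of the new transform, imported as agoradatatools.etl.transform.immunohisto_transform in the tests below. The dataset name "pathology" and every column value in the toy DataFrame are invented for illustration.

import pandas as pd

from agoradatatools.etl.transform.immunohisto_transform import immunohisto_transform

# Toy pathology-style rows; the values are made up for illustration.
pathology_df = pd.DataFrame(
    {
        "model": ["5xFAD", "5xFAD"],
        "type": ["GFAP", "GFAP"],
        "age_death": [4, 4],
        "tissue": ["cortex", "cortex"],
        "units": ["percent area", "percent area"],
        "genotype": ["5xFAD; hemizygous", "C57BL6J"],
        "measurement": [1.2, 0.4],
        "sex": ["female", "male"],
    }
)

result = immunohisto_transform(
    datasets={"pathology": pathology_df},
    dataset_name="pathology",
)

# One output row per (model, type, age_death, tissue, units) group; the
# extra columns are nested as a list of records under the "points" column.
print(result.loc[0, "points"])
# [{'genotype': '5xFAD; hemizygous', 'measurement': 1.2, 'sex': 'female'},
#  {'genotype': 'C57BL6J', 'measurement': 0.4, 'sex': 'male'}]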
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file was deleted.
@@ -0,0 +1,85 @@
import os

import pandas as pd
import pytest

from agoradatatools.etl.transform.immunohisto_transform import (
    immunohisto_transform,
)


class TestTransformGeneralModelAD:
    data_files_path = "tests/test_assets/immunohisto_transform"
    pass_test_data = [
        (
            # Pass with good fake data
            "immunohisto_transform_good_test_input.csv",
            "immunohisto_transform_good_test_output.json",
        ),
        (
            # Pass with duplicated data
            "immunohisto_transform_duplicated_input.csv",
            "immunohisto_transform_duplicated_output.json",
        ),
        (
            # Pass with none data
            "immunohisto_transform_none_input.csv",
            "immunohisto_transform_none_output.json",
        ),
        (
            # Pass with missing data
            "immunohisto_transform_missing_input.csv",
            "immunohisto_transform_missing_output.json",
        ),
        (
            # Pass with extra column
            "immunohisto_transform_extra_column.csv",
            "immunohisto_transform_extra_column_output.json",
        ),
    ]
    pass_test_ids = [
        "Pass with good fake data",
        "Pass with duplicated data",
        "Pass with none data",
        "Pass with missing data",
        "Pass with extra column",
    ]
    fail_test_data = [("immunohisto_transform_missing_column.csv")]
    fail_test_ids = [("Fail with missing column")]

    @pytest.mark.parametrize(
        "immunohisto_transform_file, expected_output_file",
        pass_test_data,
        ids=pass_test_ids,
    )
    def test_immunohisto_transform_should_pass(
        self, immunohisto_transform_file, expected_output_file
    ):
        immunohisto_transform_df = pd.read_csv(
            os.path.join(self.data_files_path, "input", immunohisto_transform_file)
        )
        output_df = pd.DataFrame(
            immunohisto_transform(
                datasets={"immunohisto_transform": immunohisto_transform_df},
                dataset_name="immunohisto_transform",
            )
        )
        expected_df = pd.read_json(
            os.path.join(self.data_files_path, "output", expected_output_file),
        )
        pd.testing.assert_frame_equal(output_df, expected_df)

    @pytest.mark.parametrize(
        "immunohisto_transform_file", fail_test_data, ids=fail_test_ids
    )
    def test_immunohisto_transform_should_fail(
        self, immunohisto_transform_file, error_type: BaseException = ValueError
    ):
        immunohisto_transform_df = pd.read_csv(
            os.path.join(self.data_files_path, "input", immunohisto_transform_file)
        )
        with pytest.raises(error_type):
            immunohisto_transform(
                datasets={"immunohisto_transform": immunohisto_transform_df},
                dataset_name="immunohisto_transform",
            )
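The CSV/JSON test assets referenced above are not shown in this diff. Purely as a hypothetical illustration of the fixture pattern (the rows and values below are invented, not the committed files), one input/expected-output pair under tests/test_assets/immunohisto_transform/ could look roughly like this:

import json

import pandas as pd

# input/immunohisto_transform_good_test_input.csv (hypothetical rows)
input_df = pd.DataFrame(
    {
        "model": ["3xTg-AD"],
        "type": ["Iba1"],
        "age_death": [12],
        "tissue": ["hippocampus"],
        "units": ["percent area"],
        "genotype": ["3xTg-AD; homozygous"],
        "measurement": [2.7],
        "sex": ["female"],
    }
)
input_df.to_csv("immunohisto_transform_good_test_input.csv", index=False)

# output/immunohisto_transform_good_test_output.json: the grouped records the
# transform should produce for the rows above; pd.read_json() in the test
# loads this back into a DataFrame with the nested "points" column.
expected_records = [
    {
        "model": "3xTg-AD",
        "type": "Iba1",
        "age_death": 12,
        "tissue": "hippocampus",
        "units": "percent area",
        "points": [
            {"genotype": "3xTg-AD; homozygous", "measurement": 2.7, "sex": "female"}
        ],
    }
]
with open("immunohisto_transform_good_test_output.json", "w") as f:
    json.dump(expected_records, f, indent=2)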