Skip to content

Commit

Permalink
[SCHEMATIC-214] Wrap pandas functions to support not including None
Browse files Browse the repository at this point in the history
… with the NA values argument (#1553)

* Wrap pandas functions to support not including `None` with the NA values argument

* Ignore types

* pylint issues

* ordering of ignore

* Add to integration test to cover none in a manifest

* Add additional test for manifest

* [SCHEMATIC-210] Add attribute to nones data model (#1555)

Update example_test_nones.model.csv component and add new invalid manifest with nones

* first commit

* ran black

* add test for validateModelManifest

* [SCHEMATIC-214] change data model and component (#1556)

* add valid values to Patient attributes

* update data model

* add test manifests

* update test for new model

* update test for new valid value

* change test to use new manifests

* remove unneeded test file

* revert file

* revert file

* change tests to use new manifests

* remove unneeded manifests

* ran black

* add tests back in

* ran black

* revert manifest

* Split up valid and errored test as separate testing functions

* Remove unused import

---------

Co-authored-by: Gianna Jordan <61707471+GiaJordan@users.noreply.github.com>
Co-authored-by: Andrew Lamb <andrewelamb@gmail.com>
Co-authored-by: Thomas Yu <thomas.yu@sagebase.org>
  • Loading branch information
4 people authored Dec 13, 2024
1 parent ef87c39 commit 70813f1
Show file tree
Hide file tree
Showing 19 changed files with 248 additions and 45 deletions.
3 changes: 2 additions & 1 deletion schematic/models/validate_attribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from schematic.schemas.data_model_graph import DataModelGraphExplorer
from schematic.store.synapse import SynapseStorage
from schematic.utils.df_utils import read_csv
from schematic.utils.validate_rules_utils import validation_rule_info
from schematic.utils.validate_utils import (
comma_separated_list_regex,
Expand Down Expand Up @@ -868,7 +869,7 @@ def _get_target_manifest_dataframes(
entity: File = self.synStore.getDatasetManifest(
datasetId=dataset_id, downloadFile=True
)
manifests.append(pd.read_csv(entity.path))
manifests.append(read_csv(entity.path))
return dict(zip(manifest_ids, manifests))

def get_target_manifests(
Expand Down
3 changes: 2 additions & 1 deletion schematic/store/database/synapse_database_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from opentelemetry import trace

from schematic.store.synapse_tracker import SynapseEntityTracker
from schematic.utils.df_utils import read_csv


class SynapseTableNameError(Exception):
Expand Down Expand Up @@ -108,7 +109,7 @@ def execute_sql_query(
pandas.DataFrame: The queried table
"""
result = self.execute_sql_statement(query, include_row_data)
table = pandas.read_csv(result.filepath)
table = read_csv(result.filepath)
return table

def execute_sql_statement(
Expand Down
21 changes: 17 additions & 4 deletions schematic/store/synapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,12 @@
from schematic.store.base import BaseStorage
from schematic.store.database.synapse_database import SynapseDatabase
from schematic.store.synapse_tracker import SynapseEntityTracker
from schematic.utils.df_utils import col_in_dataframe, load_df, update_df
from schematic.utils.df_utils import (
STR_NA_VALUES_FILTERED,
col_in_dataframe,
load_df,
update_df,
)

# entity_type_mapping, get_dir_size, create_temp_folder, check_synapse_cache_size, and clear_synapse_cache functions are used for AWS deployment
# Please do not remove these import statements
Expand Down Expand Up @@ -399,7 +404,7 @@ def query_fileview(
try:
self.storageFileviewTable = self.syn.tableQuery(
query=self.fileview_query,
).asDataFrame()
).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
except SynapseHTTPError as exc:
exception_text = str(exc)
if "Unknown column path" in exception_text:
Expand Down Expand Up @@ -1418,7 +1423,11 @@ def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable
"""

results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id))
df = results.asDataFrame(rowIdAndVersionInIndex=False)
df = results.asDataFrame(
rowIdAndVersionInIndex=False,
na_values=STR_NA_VALUES_FILTERED,
keep_default_na=False,
)

return df, results

Expand Down Expand Up @@ -3470,7 +3479,11 @@ def query(self, tidy=True, force=False):
if self.table is None or force:
fileview_id = self.view_schema["id"]
self.results = self.synapse.tableQuery(f"select * from {fileview_id}")
self.table = self.results.asDataFrame(rowIdAndVersionInIndex=False)
self.table = self.results.asDataFrame(
rowIdAndVersionInIndex=False,
na_values=STR_NA_VALUES_FILTERED,
keep_default_na=False,
)
if tidy:
self.tidy_table()
return self.table
Expand Down
51 changes: 45 additions & 6 deletions schematic/utils/df_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,58 @@

import logging
from copy import deepcopy
from time import perf_counter
from typing import Union, Any, Optional
from datetime import datetime
from time import perf_counter
from typing import Any, Optional, Union

import dateparser as dp
import pandas as pd
import numpy as np
import pandas as pd
from pandarallel import pandarallel # type: ignore

# pylint:disable=no-name-in-module
from pandas._libs.parsers import STR_NA_VALUES # type: ignore

# Copy of pandas' default NA-value strings with "None" removed: schematic
# treats the literal string "None" as real data, not as a missing value.
STR_NA_VALUES_FILTERED = set(STR_NA_VALUES) - {"None"}

logger = logging.getLogger(__name__)


def read_csv(
    path_or_buffer: str,
    keep_default_na: bool = False,
    encoding: str = "utf8",
    **load_args: Any,
) -> pd.DataFrame:
    """Read a CSV without treating the string "None" as a missing value.

    A thin wrapper around ``pd.read_csv`` that, by default, swaps pandas'
    built-in NA-value list for one that excludes "None".

    Args:
        path_or_buffer: Path to the CSV file, or a buffer containing it.
        keep_default_na: Whether to fall back to pandas' default NA handling.
        encoding: Text encoding of the file.
        **load_args: Additional keyword arguments forwarded to ``pd.read_csv``.

    Returns:
        pd.DataFrame: The dataframe parsed from the CSV file or buffer.
    """
    # An explicit caller-supplied na_values always wins; otherwise use the
    # filtered list unless the caller opted back into pandas defaults.
    if "na_values" in load_args:
        na_values = load_args.pop("na_values")
    else:
        na_values = None if keep_default_na else STR_NA_VALUES_FILTERED
    return pd.read_csv(  # type: ignore
        path_or_buffer,
        na_values=na_values,
        keep_default_na=keep_default_na,
        encoding=encoding,
        **load_args,
    )


def load_df(
file_path: str,
preserve_raw_input: bool = True,
Expand Down Expand Up @@ -45,9 +86,7 @@ def load_df(
t_load_df = perf_counter()

# Read CSV to df as type specified in kwargs
org_df = pd.read_csv( # type: ignore
file_path, keep_default_na=True, encoding="utf8", **load_args
)
org_df = read_csv(file_path, encoding="utf8", **load_args) # type: ignore
if not isinstance(org_df, pd.DataFrame):
raise ValueError(
(
Expand Down
3 changes: 2 additions & 1 deletion schematic_api/api/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
from schematic.schemas.data_model_parser import DataModelParser
from schematic.store.synapse import ManifestDownload, SynapseStorage
from schematic.utils.df_utils import read_csv
from schematic.utils.general import create_temp_folder, entity_type_mapping
from schematic.utils.schema_utils import (
DisplayLabelType,
Expand Down Expand Up @@ -178,7 +179,7 @@ def parse_bool(str_bool):


def return_as_json(manifest_local_file_path):
    """Load a manifest CSV and return its rows as a list of record dicts.

    Args:
        manifest_local_file_path: Path to the manifest CSV on local disk.

    Returns:
        list[dict]: One dict per manifest row, keyed by column name.
    """
    # Single read via the project's read_csv wrapper (which keeps the literal
    # string "None" as data); the earlier duplicate pd.read_csv call whose
    # result was immediately overwritten has been removed.
    manifest_csv = read_csv(manifest_local_file_path)
    manifest_json = manifest_csv.to_dict(orient="records")
    return manifest_json

Expand Down
6 changes: 3 additions & 3 deletions tests/data/example.model.csv
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Cancer Type,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,,
Family History,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,,list strict
Biospecimen,,,"Sample ID, Patient ID, Tissue Status, Component",,FALSE,DataType,Patient,,
Sample ID,,,,,TRUE,DataProperty,,,
Tissue Status,,"Healthy, Malignant",,,TRUE,DataProperty,,,
Tissue Status,,"Healthy, Malignant, None",,,TRUE,DataProperty,,,
Bulk RNA-seq Assay,,,"Filename, Sample ID, File Format, Component",,FALSE,DataType,Biospecimen,,
Filename,,,,,TRUE,DataProperty,,,#MockFilename filenameExists^^
File Format,,"FASTQ, BAM, CRAM, CSV/TSV",,,TRUE,DataProperty,,,
Expand All @@ -24,8 +24,8 @@ Check List,,,,,TRUE,DataProperty,,,list
Check List Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list
Check List Like,,,,,TRUE,DataProperty,,,list like
Check List Like Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list like
Check List Strict,,,,,TRUE,DataProperty,,,list strict
Check List Enum Strict,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict
Check List Strict,,,,,TRUE,DataProperty,,,list strict
Check List Enum Strict,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict
Check Regex List,,,,,TRUE,DataProperty,,,list::regex match [a-f]
Check Regex List Strict,,,,,TRUE,DataProperty,,,list strict::regex match [a-f]
Check Regex List Like,,,,,TRUE,DataProperty,,,list like::regex match [a-f]
Expand Down
20 changes: 20 additions & 0 deletions tests/data/example.model.jsonld
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,9 @@
},
{
"@id": "bts:Malignant"
},
{
"@id": "bts:None"
}
],
"sms:displayName": "Tissue Status",
Expand All @@ -563,6 +566,23 @@
"sms:required": "sms:false",
"sms:validationRules": []
},
{
"@id": "bts:None",
"@type": "rdfs:Class",
"rdfs:comment": "TBD",
"rdfs:label": "None",
"rdfs:subClassOf": [
{
"@id": "bts:TissueStatus"
}
],
"schema:isPartOf": {
"@id": "http://schema.biothings.io"
},
"sms:displayName": "None",
"sms:required": "sms:false",
"sms:validationRules": []
},
{
"@id": "bts:BulkRNA-seqAssay",
"@type": "rdfs:Class",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Sample ID,Patient ID,Tissue Status,Component
1,1,Healthy,Biospecimen
2,2,Malignant,Biospecimen
3,3,None,Biospecimen
4,4,None,Biospecimen
5,5,InvalidValue,Biospecimen
6 changes: 6 additions & 0 deletions tests/data/mock_manifests/Valid_none_value_test_manifest.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Sample ID,Patient ID,Tissue Status,Component
1,1,Healthy,Biospecimen
2,2,Malignant,Biospecimen
3,3,None,Biospecimen
4,4,None,Biospecimen
5,5,None,Biospecimen
8 changes: 4 additions & 4 deletions tests/integration/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from io import BytesIO

import numpy as np
import pandas as pd
import pytest
import requests
from click.testing import CliRunner
Expand All @@ -14,6 +13,7 @@
from schematic.configuration.configuration import CONFIG, Configuration
from schematic.manifest.commands import manifest
from schematic.models.commands import model
from schematic.utils.df_utils import read_csv
from tests.conftest import ConfigurationForTesting

LIGHT_BLUE = "FFEAF7F9" # Required cell
Expand Down Expand Up @@ -155,8 +155,8 @@ def test_generate_empty_csv_manifests(self, runner: CliRunner) -> None:
# command has no (python) errors, has exit code 0
assert result.exit_code == 0

biospecimen_df = pd.read_csv("tests/data/example.Biospecimen.manifest.csv")
patient_df = pd.read_csv("tests/data/example.Patient.manifest.csv")
biospecimen_df = read_csv("tests/data/example.Biospecimen.manifest.csv")
patient_df = read_csv("tests/data/example.Patient.manifest.csv")

# Remove created files:
finally:
Expand Down Expand Up @@ -339,7 +339,7 @@ def test_generate_empty_google_sheet_manifests(
assert False, f"Unexpected data validation found: {dv}"
assert tissue_status_validation is not None
assert tissue_status_validation.type == "list"
assert tissue_status_validation.formula1 == "Sheet2!$C$2:$C$3"
assert tissue_status_validation.formula1 == "Sheet2!$C$2:$C$4"

# required fields are marked as “light blue”, while other non-required fields are marked as white.
for col in ["Sample ID", "Patient ID", "Tissue Status", "Component"]:
Expand Down
12 changes: 10 additions & 2 deletions tests/integration/test_manifest_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
import uuid
from typing import Any, Callable, Dict

import pandas as pd
import pytest
import requests
from flask.testing import FlaskClient
from synapseclient.client import Synapse

from schematic.configuration.configuration import CONFIG
from schematic.store.synapse import SynapseStorage
from schematic.utils.df_utils import read_csv
from tests.conftest import ConfigurationForTesting, Helpers
from tests.utils import CleanupItem

Expand Down Expand Up @@ -73,7 +73,7 @@ def validate_submitted_manifest_file(
manifest_file_path = os.path.join(
download_location, manifest_data["properties"]["name"]
)
manifest_submitted_df = pd.read_csv(manifest_file_path)
manifest_submitted_df = read_csv(manifest_file_path)
assert "entityId" in manifest_submitted_df.columns
assert "Id" in manifest_submitted_df.columns

Expand Down Expand Up @@ -1035,6 +1035,11 @@ def test_submit_manifest_with_hide_blanks(
randomized_annotation_content = str(uuid.uuid4())
df["RandomizedAnnotation"] = randomized_annotation_content

# AND a "None" string remains in the manifest
df["NoneString"] = "None"
df["NoneString1"] = "none"
df["NoneString2"] = "NoNe"

with tempfile.NamedTemporaryFile(delete=True, suffix=".csv") as tmp_file:
# Write the DF to a temporary file
df.to_csv(tmp_file.name, index=False)
Expand Down Expand Up @@ -1070,6 +1075,9 @@ def test_submit_manifest_with_hide_blanks(
modified_file = syn.get(df["entityId"][0], downloadFile=False)
assert modified_file is not None
assert modified_file["RandomizedAnnotation"][0] == randomized_annotation_content
assert modified_file["NoneString"][0] == "None"
assert modified_file["NoneString1"][0] == "none"
assert modified_file["NoneString2"][0] == "NoNe"

# AND the blank annotations are not present
assert "Genome Build" not in modified_file
Expand Down
Loading

0 comments on commit 70813f1

Please sign in to comment.