Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SCHEMATIC-214] Wrap pandas functions to support not including None with the NA values argument #1553

Merged
merged 27 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
af4683c
Wrap pandas functions to support not including `None` with the NA val…
BryanFauble Nov 22, 2024
21421df
Ignore types
BryanFauble Nov 22, 2024
0b221ce
pylint issues
BryanFauble Nov 22, 2024
5af786d
ordering of ignore
BryanFauble Nov 22, 2024
dcb20b1
Add to integration test to cover none in a manifest
BryanFauble Nov 25, 2024
f43cd28
Merge branch 'develop' into schematic-210-pandas-remove-none
BryanFauble Nov 25, 2024
6eeacd5
Add additional test for manifest
BryanFauble Dec 3, 2024
28f810e
[SCHEMATIC-210] Add attribute to nones data model (#1555)
GiaJordan Dec 9, 2024
e129361
first commit
andrewelamb Dec 10, 2024
d443b31
Merge branch 'schematic-210-pandas-remove-none' of https://github.com…
andrewelamb Dec 10, 2024
7a3bb1c
ran black
andrewelamb Dec 10, 2024
9831cc2
Merge branch 'develop' into schematic-210-pandas-remove-none
thomasyu888 Dec 10, 2024
41afbc3
add test for validateModelManifest
andrewelamb Dec 11, 2024
929275f
[SCHEMATIC-214] change data model and component (#1556)
GiaJordan Dec 11, 2024
ab364f1
Merge branch 'schematic-210-pandas-remove-none' of https://github.com…
andrewelamb Dec 12, 2024
2c67d1e
change test to use new manifests
andrewelamb Dec 12, 2024
d53d988
remove uneeded test file
andrewelamb Dec 12, 2024
39c11ab
revert file
andrewelamb Dec 12, 2024
b5960a4
revert file
andrewelamb Dec 12, 2024
a291b32
change tests to use new manifests
andrewelamb Dec 12, 2024
1708262
remove uneeded manifests
andrewelamb Dec 12, 2024
2cbcae4
ran black
andrewelamb Dec 12, 2024
b89c597
add tests back in
andrewelamb Dec 12, 2024
ae6f658
ran black
andrewelamb Dec 12, 2024
6ffa9d6
revert manifest
andrewelamb Dec 12, 2024
d0a8e15
Split up valid and errored test as separate testing functions
thomasyu888 Dec 13, 2024
7310340
Remove unused import
thomasyu888 Dec 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion schematic/models/validate_attribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from schematic.schemas.data_model_graph import DataModelGraphExplorer
from schematic.store.synapse import SynapseStorage
from schematic.utils.df_utils import read_csv
from schematic.utils.validate_rules_utils import validation_rule_info
from schematic.utils.validate_utils import (
comma_separated_list_regex,
Expand Down Expand Up @@ -868,7 +869,7 @@ def _get_target_manifest_dataframes(
entity: File = self.synStore.getDatasetManifest(
datasetId=dataset_id, downloadFile=True
)
manifests.append(pd.read_csv(entity.path))
manifests.append(read_csv(entity.path))
andrewelamb marked this conversation as resolved.
Show resolved Hide resolved
return dict(zip(manifest_ids, manifests))

def get_target_manifests(
Expand Down
3 changes: 2 additions & 1 deletion schematic/store/database/synapse_database_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from opentelemetry import trace

from schematic.store.synapse_tracker import SynapseEntityTracker
from schematic.utils.df_utils import read_csv


class SynapseTableNameError(Exception):
Expand Down Expand Up @@ -108,7 +109,7 @@ def execute_sql_query(
pandas.DataFrame: The queried table
"""
result = self.execute_sql_statement(query, include_row_data)
table = pandas.read_csv(result.filepath)
table = read_csv(result.filepath)
return table

def execute_sql_statement(
Expand Down
21 changes: 17 additions & 4 deletions schematic/store/synapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,12 @@
from schematic.store.base import BaseStorage
from schematic.store.database.synapse_database import SynapseDatabase
from schematic.store.synapse_tracker import SynapseEntityTracker
from schematic.utils.df_utils import col_in_dataframe, load_df, update_df
from schematic.utils.df_utils import (
STR_NA_VALUES_FILTERED,
col_in_dataframe,
load_df,
update_df,
)

# entity_type_mapping, get_dir_size, create_temp_folder, check_synapse_cache_size, and clear_synapse_cache functions are used for AWS deployment
# Please do not remove these import statements
Expand Down Expand Up @@ -399,7 +404,7 @@ def query_fileview(
try:
self.storageFileviewTable = self.syn.tableQuery(
query=self.fileview_query,
).asDataFrame()
).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
except SynapseHTTPError as exc:
exception_text = str(exc)
if "Unknown column path" in exception_text:
Expand Down Expand Up @@ -1418,7 +1423,11 @@ def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable
"""

results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id))
df = results.asDataFrame(rowIdAndVersionInIndex=False)
df = results.asDataFrame(
rowIdAndVersionInIndex=False,
na_values=STR_NA_VALUES_FILTERED,
keep_default_na=False,
)

return df, results

Expand Down Expand Up @@ -3470,7 +3479,11 @@ def query(self, tidy=True, force=False):
if self.table is None or force:
fileview_id = self.view_schema["id"]
self.results = self.synapse.tableQuery(f"select * from {fileview_id}")
self.table = self.results.asDataFrame(rowIdAndVersionInIndex=False)
self.table = self.results.asDataFrame(
rowIdAndVersionInIndex=False,
na_values=STR_NA_VALUES_FILTERED,
keep_default_na=False,
)
if tidy:
self.tidy_table()
return self.table
Expand Down
51 changes: 45 additions & 6 deletions schematic/utils/df_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,58 @@

import logging
from copy import deepcopy
from time import perf_counter
from typing import Union, Any, Optional
from datetime import datetime
from time import perf_counter
from typing import Any, Optional, Union

import dateparser as dp
import pandas as pd
import numpy as np
import pandas as pd
from pandarallel import pandarallel # type: ignore

# pylint:disable=no-name-in-module
from pandas._libs.parsers import STR_NA_VALUES # type: ignore

# Copy of pandas' default NA sentinel set with the literal string "None"
# removed, so "None" appearing in a manifest cell survives parsing as a
# real string instead of being coerced to NaN.
STR_NA_VALUES_FILTERED = deepcopy(STR_NA_VALUES)
# set.discard is a no-op when the element is absent, so no KeyError guard is needed.
STR_NA_VALUES_FILTERED.discard("None")

logger = logging.getLogger(__name__)


def read_csv(
    path_or_buffer: Any,
    keep_default_na: bool = False,
    encoding: str = "utf8",
    **load_args: Any,
) -> pd.DataFrame:
    """
    A wrapper around ``pd.read_csv`` that, by default, does NOT treat the
    literal string "None" as a missing value.

    Args:
        path_or_buffer: The path to the file or a buffer containing the file.
            Annotated ``Any`` because anything ``pd.read_csv`` accepts
            (path string or file-like buffer) is valid here.
        keep_default_na: Whether to fall back to pandas' full default NA list.
            When False (the default), the filtered list — every pandas default
            sentinel except "None" — is used instead.
        encoding: The encoding of the file.
        **load_args: Additional keyword arguments passed through to
            ``pd.read_csv``. An explicit ``na_values`` here takes precedence
            over both defaults.

    Returns:
        pd.DataFrame: The dataframe created from the CSV file or buffer.
    """
    # Resolve na_values lazily: only consult the module-level filtered set
    # when neither the caller's na_values nor keep_default_na decides it.
    if "na_values" in load_args:
        # Caller supplied their own sentinel list; respect it unchanged.
        na_values = load_args.pop("na_values")
    elif keep_default_na:
        # None lets pd.read_csv apply its own default NA handling.
        na_values = None
    else:
        na_values = STR_NA_VALUES_FILTERED
    return pd.read_csv(  # type: ignore
        path_or_buffer,
        na_values=na_values,
        keep_default_na=keep_default_na,
        encoding=encoding,
        **load_args,
    )


def load_df(
file_path: str,
preserve_raw_input: bool = True,
Expand Down Expand Up @@ -45,9 +86,7 @@ def load_df(
t_load_df = perf_counter()

# Read CSV to df as type specified in kwargs
org_df = pd.read_csv( # type: ignore
file_path, keep_default_na=True, encoding="utf8", **load_args
)
org_df = read_csv(file_path, encoding="utf8", **load_args) # type: ignore
if not isinstance(org_df, pd.DataFrame):
raise ValueError(
(
Expand Down
3 changes: 2 additions & 1 deletion schematic_api/api/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
from schematic.schemas.data_model_parser import DataModelParser
from schematic.store.synapse import ManifestDownload, SynapseStorage
from schematic.utils.df_utils import read_csv
from schematic.utils.general import create_temp_folder, entity_type_mapping
from schematic.utils.schema_utils import (
DisplayLabelType,
Expand Down Expand Up @@ -178,7 +179,7 @@ def parse_bool(str_bool):


def return_as_json(manifest_local_file_path):
    """Load a downloaded manifest CSV and return it as a list of row records.

    Args:
        manifest_local_file_path: Path to the locally downloaded manifest CSV.

    Returns:
        A list of dicts, one per manifest row (``DataFrame.to_dict("records")``).
    """
    manifest_df = read_csv(manifest_local_file_path)
    return manifest_df.to_dict(orient="records")

Expand Down
6 changes: 3 additions & 3 deletions tests/data/example.model.csv
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Cancer Type,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,,
Family History,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,,list strict
Biospecimen,,,"Sample ID, Patient ID, Tissue Status, Component",,FALSE,DataType,Patient,,
Sample ID,,,,,TRUE,DataProperty,,,
Tissue Status,,"Healthy, Malignant",,,TRUE,DataProperty,,,
Tissue Status,,"Healthy, Malignant, None",,,TRUE,DataProperty,,,
Bulk RNA-seq Assay,,,"Filename, Sample ID, File Format, Component",,FALSE,DataType,Biospecimen,,
Filename,,,,,TRUE,DataProperty,,,#MockFilename filenameExists^^
File Format,,"FASTQ, BAM, CRAM, CSV/TSV",,,TRUE,DataProperty,,,
Expand All @@ -24,8 +24,8 @@ Check List,,,,,TRUE,DataProperty,,,list
Check List Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list
Check List Like,,,,,TRUE,DataProperty,,,list like
Check List Like Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list like
Check List Strict,,,,,TRUE,DataProperty,,,list strict
Check List Enum Strict,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict
Check List Strict,,,,,TRUE,DataProperty,,,list strict
Check List Enum Strict,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict
Check Regex List,,,,,TRUE,DataProperty,,,list::regex match [a-f]
Check Regex List Strict,,,,,TRUE,DataProperty,,,list strict::regex match [a-f]
Check Regex List Like,,,,,TRUE,DataProperty,,,list like::regex match [a-f]
Expand Down
20 changes: 20 additions & 0 deletions tests/data/example.model.jsonld
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,9 @@
},
{
"@id": "bts:Malignant"
},
{
"@id": "bts:None"
}
],
"sms:displayName": "Tissue Status",
Expand All @@ -563,6 +566,23 @@
"sms:required": "sms:false",
"sms:validationRules": []
},
{
"@id": "bts:None",
"@type": "rdfs:Class",
"rdfs:comment": "TBD",
"rdfs:label": "None",
"rdfs:subClassOf": [
{
"@id": "bts:TissueStatus"
}
],
"schema:isPartOf": {
"@id": "http://schema.biothings.io"
},
"sms:displayName": "None",
"sms:required": "sms:false",
"sms:validationRules": []
},
{
"@id": "bts:BulkRNA-seqAssay",
"@type": "rdfs:Class",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Sample ID,Patient ID,Tissue Status,Component
1,1,Healthy,Biospecimen
2,2,Malignant,Biospecimen
3,3,None,Biospecimen
4,4,None,Biospecimen
5,5,InvalidValue,Biospecimen
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Sample ID,Patient ID,Tissue Status,Component
1,1,Healthy,Biospecimen
2,2,Malignant,Biospecimen
3,3,None,Biospecimen
4,4,None,Biospecimen
5,5,None,Biospecimen
8 changes: 4 additions & 4 deletions tests/integration/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from io import BytesIO

import numpy as np
import pandas as pd
import pytest
import requests
from click.testing import CliRunner
Expand All @@ -14,6 +13,7 @@
from schematic.configuration.configuration import CONFIG, Configuration
from schematic.manifest.commands import manifest
from schematic.models.commands import model
from schematic.utils.df_utils import read_csv
from tests.conftest import ConfigurationForTesting

LIGHT_BLUE = "FFEAF7F9" # Required cell
Expand Down Expand Up @@ -155,8 +155,8 @@ def test_generate_empty_csv_manifests(self, runner: CliRunner) -> None:
# command has no (python) errors, has exit code 0
assert result.exit_code == 0

biospecimen_df = pd.read_csv("tests/data/example.Biospecimen.manifest.csv")
patient_df = pd.read_csv("tests/data/example.Patient.manifest.csv")
biospecimen_df = read_csv("tests/data/example.Biospecimen.manifest.csv")
patient_df = read_csv("tests/data/example.Patient.manifest.csv")

# Remove created files:
finally:
Expand Down Expand Up @@ -339,7 +339,7 @@ def test_generate_empty_google_sheet_manifests(
assert False, f"Unexpected data validation found: {dv}"
assert tissue_status_validation is not None
assert tissue_status_validation.type == "list"
assert tissue_status_validation.formula1 == "Sheet2!$C$2:$C$3"
assert tissue_status_validation.formula1 == "Sheet2!$C$2:$C$4"

# required fields are marked as “light blue”, while other non-required fields are marked as white.
for col in ["Sample ID", "Patient ID", "Tissue Status", "Component"]:
Expand Down
12 changes: 10 additions & 2 deletions tests/integration/test_manifest_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
import uuid
from typing import Any, Callable, Dict

import pandas as pd
import pytest
import requests
from flask.testing import FlaskClient
from synapseclient.client import Synapse

from schematic.configuration.configuration import CONFIG
from schematic.store.synapse import SynapseStorage
from schematic.utils.df_utils import read_csv
from tests.conftest import ConfigurationForTesting, Helpers
from tests.utils import CleanupItem

Expand Down Expand Up @@ -73,7 +73,7 @@ def validate_submitted_manifest_file(
manifest_file_path = os.path.join(
download_location, manifest_data["properties"]["name"]
)
manifest_submitted_df = pd.read_csv(manifest_file_path)
manifest_submitted_df = read_csv(manifest_file_path)
assert "entityId" in manifest_submitted_df.columns
assert "Id" in manifest_submitted_df.columns

Expand Down Expand Up @@ -1035,6 +1035,11 @@ def test_submit_manifest_with_hide_blanks(
randomized_annotation_content = str(uuid.uuid4())
df["RandomizedAnnotation"] = randomized_annotation_content

# AND a "None" string remains in the manifest
df["NoneString"] = "None"
df["NoneString1"] = "none"
df["NoneString2"] = "NoNe"

with tempfile.NamedTemporaryFile(delete=True, suffix=".csv") as tmp_file:
# Write the DF to a temporary file
df.to_csv(tmp_file.name, index=False)
Expand Down Expand Up @@ -1070,6 +1075,9 @@ def test_submit_manifest_with_hide_blanks(
modified_file = syn.get(df["entityId"][0], downloadFile=False)
assert modified_file is not None
assert modified_file["RandomizedAnnotation"][0] == randomized_annotation_content
assert modified_file["NoneString"][0] == "None"
assert modified_file["NoneString1"][0] == "none"
assert modified_file["NoneString2"][0] == "NoNe"

# AND the blank annotations are not present
assert "Genome Build" not in modified_file
Expand Down
Loading
Loading