Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SCHEMATIC-1] BugFix: manifest submission when data previously annotated without manifest upload #1538

Merged
merged 9 commits into from
Nov 13, 2024
8 changes: 7 additions & 1 deletion schematic/store/synapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2825,7 +2825,7 @@ def getDatasetAnnotations(
try:
logger.info("Trying batch mode for retrieving Synapse annotations")
table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids)
except (SynapseAuthenticationError, SynapseHTTPError):
except (SynapseAuthenticationError, SynapseHTTPError, ValueError):
thomasyu888 marked this conversation as resolved.
Show resolved Hide resolved
logger.info(
f"Unable to create a temporary file view bound to {datasetId}. "
"Defaulting to slower iterative retrieval of annotations."
Expand Down Expand Up @@ -3508,6 +3508,12 @@ def _fix_default_columns(self):
# Rename ROW_ETAG column to eTag and place at end of data frame
if "ROW_ETAG" in self.table:
row_etags = self.table.pop("ROW_ETAG")

# The eTag column may already be present if users annotated data without submitting a manifest.
# We're only concerned with the new values, not the existing ones.
if "eTag" in self.table:
del self.table["eTag"]

self.table.insert(len(self.table.columns), "eTag", row_etags)

return self.table
Expand Down
66 changes: 66 additions & 0 deletions tests/test_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import pandas as pd
import pytest
from numpy import nan
from pandas.testing import assert_frame_equal
from synapseclient import EntityViewSchema, Folder
from synapseclient.core.exceptions import SynapseHTTPError
Expand Down Expand Up @@ -991,6 +992,71 @@ def test_tidy_table(self, dataset_fileview_table_tidy):
assert isinstance(year_value, str)
assert year_value == "1980"

def test_tidy_table_no_manifest_uploaded(self, synapse_store):
    """
    Check that a DatasetFileView table can be tidied when the underlying dataset
    has annotated files but no uploaded manifest.

    This is the scenario where a user validates a manifest with schematic,
    annotates the files with a non-schematic tool (ie the R client), and then
    asks schematic to generate a manifest for that dataset.
    """
    # GIVEN a dataset whose files are annotated (eTag annotation included)
    # but for which no manifest was uploaded
    dataset_id = "syn64019998"

    # AND the metadata we expect to read back from the files in that dataset;
    # the first file carries no annotations, the remaining four are identical
    expected_metadata = pd.DataFrame(
        {
            "Component": [nan] + ["BulkRNA-seqAssay"] * 4,
            "FileFormat": [nan] + ["BAM"] * 4,
            "GenomeBuild": [nan] + ["GRCh37"] * 4,
            "entityId": [
                "syn64019999",
                "syn64020000",
                "syn64020001",
                "syn64020002",
                "syn64020003",
            ],
        }
    )
    expected_metadata = expected_metadata.set_index("entityId", drop=False)

    # WHEN a DatasetFileView is created for the dataset
    dataset_fileview = DatasetFileView(dataset_id, synapse_store.syn)

    # AND the fileview is queried with tidying switched off
    raw_table = dataset_fileview.query(tidy=False, force=True)

    # THEN a non-empty data frame should come back
    assert isinstance(raw_table, pd.DataFrame)
    assert not raw_table.empty

    # AND it should already contain an eTag column, which is removed here and
    # kept so it can be compared against the tidied result
    assert "eTag" in raw_table.columns
    original_etag_column = raw_table.pop("eTag")

    # AND tidying the table should not raise
    with does_not_raise():
        tidied_table = dataset_fileview.tidy_table()

    # AND every eTag value after tidying should differ from the original one
    new_etag_column = tidied_table.pop("eTag").reset_index(drop=True)
    assert (new_etag_column != original_etag_column).all()

    # AND the remaining columns should match the expected metadata
    assert_frame_equal(tidied_table, expected_metadata)


@pytest.mark.table_operations
class TestTableOperations:
Expand Down