From f3d878dc568ffc08ade5d9452f86db39c1c8de0d Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Mon, 11 Nov 2024 14:42:14 -0700 Subject: [PATCH 1/9] add test for bug case --- tests/test_store.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/tests/test_store.py b/tests/test_store.py index af00ba301..db6d4343e 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -15,7 +15,7 @@ import pandas as pd import pytest -from pandas.testing import assert_frame_equal +from pandas.testing import assert_frame_equal, assert_series_equal from synapseclient import EntityViewSchema, Folder from synapseclient.core.exceptions import SynapseHTTPError from synapseclient.entity import File, Project @@ -991,6 +991,34 @@ def test_tidy_table(self, dataset_fileview_table_tidy): assert isinstance(year_value, str) assert year_value == "1980" + def test_tidy_no_manifest_uploaded(self, synapse_store): + """ + Test to ensure that the table can be tidied without issue when a DatasetFileView object is instantiated + based on a dataset that has files annotated but no manifest uploaded. + Covers the case where a user validates a manifest with schematic, and annotates the files with a non-schematic tool (ie the R client), + and then tries to generate a manifest for the dataset with schematic. + """ + # GIVEN a dataset that has files annotated but no manifest uplodaded + dataset_id = "syn64019998" + # WHEN a DatasetFileView object is instantiated based on the dataset + dataset_fileview = DatasetFileView(dataset_id, synapse_store.syn) + # AND the fileview is queried + table = dataset_fileview.query(tidy=False, force=True) + # THEN a table should be present + assert isinstance(table, pd.DataFrame) + # AND the table should not be empty + assert not table.empty + # AND the table should already include the eTag column + assert "eTag" in table.columns + original_etag_colum = table["eTag"] + # AND the table should be able to be tidied without an exception being raised + with does_not_raise(): + table = dataset_fileview.tidy_table() + # AND the expected metadata should be present in the table + + # AND the eTag column should be different from the original eTag column + assert (table["eTag"] != original_etag_colum).all() + @pytest.mark.table_operations class TestTableOperations: From 7dcc88f994cced6d0baa5c66bca92bf7a531b1c3 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Mon, 11 Nov 2024 15:33:49 -0700 Subject: [PATCH 2/9] update test for table tidyness --- tests/test_store.py | 54 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 8 deletions(-) diff --git a/tests/test_store.py b/tests/test_store.py index db6d4343e..9e1e98b13 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -15,6 +15,7 @@ import pandas as pd import pytest +from numpy import nan from pandas.testing import assert_frame_equal, assert_series_equal from synapseclient import EntityViewSchema, Folder from synapseclient.core.exceptions import SynapseHTTPError @@ -991,33 +992,70 @@ def test_tidy_table(self, dataset_fileview_table_tidy): assert isinstance(year_value, str) assert year_value == "1980" - def test_tidy_no_manifest_uploaded(self, synapse_store): + def test_tidy_table_no_manifest_uploaded(self, synapse_store): """ Test to ensure that the table can be tidied without issue when a DatasetFileView object is instantiated based on a dataset that has files annotated but no manifest uploaded. Covers the case where a user validates a manifest with schematic, and annotates the files with a non-schematic tool (ie the R client), and then tries to generate a manifest for the dataset with schematic. """ - # GIVEN a dataset that has files annotated but no manifest uplodaded + # GIVEN a dataset that has files annotated (including the eTag annotation) but no manifest uplodaded dataset_id = "syn64019998" + + # AND the expected metadata from the files in the dataset + expected_metadata = pd.DataFrame( + { + "Component": { + 0: nan, + 1: "BulkRNA-seqAssay", + 2: "BulkRNA-seqAssay", + 3: "BulkRNA-seqAssay", + 4: "BulkRNA-seqAssay", + }, + "FileFormat": {0: nan, 1: "BAM", 2: "BAM", 3: "BAM", 4: "BAM"}, + "GenomeBuild": { + 0: nan, + 1: "GRCh37", + 2: "GRCh37", + 3: "GRCh37", + 4: "GRCh37", + }, + "entityId": { + 0: "syn64019999", + 1: "syn64020000", + 2: "syn64020001", + 3: "syn64020002", + 4: "syn64020003", + }, + }, + ).set_index("entityId", drop=False) + # WHEN a DatasetFileView object is instantiated based on the dataset dataset_fileview = DatasetFileView(dataset_id, synapse_store.syn) - # AND the fileview is queried + + # AND the fileview is queried without being tidied table = dataset_fileview.query(tidy=False, force=True) + # THEN a table should be present assert isinstance(table, pd.DataFrame) + # AND the table should not be empty assert not table.empty - # AND the table should already include the eTag column + + # AND the table should already include the eTag column that will be removed and saved for comparison later assert "eTag" in table.columns - original_etag_colum = table["eTag"] - # AND the table should be able to be tidied without an exception being raised + original_etag_colum = table.pop("eTag") + + # AND when the table is tidied no exception should be raised with does_not_raise(): table = dataset_fileview.tidy_table() - # AND the expected metadata should be present in the table # AND the eTag column should be different from the original eTag column - assert (table["eTag"] != original_etag_colum).all() + new_etag_column = table.pop("eTag").reset_index(drop=True) + assert (new_etag_column != original_etag_colum).all() + + # AND the expected metadata should be present in the table + assert_frame_equal(table, expected_metadata) @pytest.mark.table_operations From 6d0cdd044e3d66be3674f790d04e2eda4e6080f3 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Mon, 11 Nov 2024 15:34:10 -0700 Subject: [PATCH 3/9] remove unused import --- tests/test_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_store.py b/tests/test_store.py index 9e1e98b13..5b5d6e11c 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -16,7 +16,7 @@ import pandas as pd import pytest from numpy import nan -from pandas.testing import assert_frame_equal, assert_series_equal +from pandas.testing import assert_frame_equal from synapseclient import EntityViewSchema, Folder from synapseclient.core.exceptions import SynapseHTTPError from synapseclient.entity import File, Project From 37cd4cd67699dc46eedbadd95cb26f58b65eb8e8 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Mon, 11 Nov 2024 15:35:17 -0700 Subject: [PATCH 4/9] remove etag column if already present when building temp file view --- schematic/store/synapse.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index 4c1b900a8..1062c0c95 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -3508,6 +3508,12 @@ def _fix_default_columns(self): # Rename ROW_ETAG column to eTag and place at end of data frame if "ROW_ETAG" in self.table: row_etags = self.table.pop("ROW_ETAG") + + # eTag column may already present if users annotated data without submitting manifest + # we're only concerned with the new values and not the existing ones + if "eTag" in self.table: + del self.table["eTag"] + self.table.insert(len(self.table.columns), "eTag", row_etags) return self.table From 68b0b244ef97f0705ae33eee925cfe72100330e9 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Tue, 12 Nov 2024 10:07:34 -0700 Subject: [PATCH 5/9] catch all exceptions to switch to sequential mode --- schematic/store/synapse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index 1062c0c95..29457fb3d 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -2825,7 +2825,7 @@ def getDatasetAnnotations( try: logger.info("Trying batch mode for retrieving Synapse annotations") table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids) - except (SynapseAuthenticationError, SynapseHTTPError): + except: logger.info( f"Unable to create a temporary file view bound to {datasetId}. " "Defaulting to slower iterative retrieval of annotations." From 255e3c0bf80db98647b4efa77631fb051c481c17 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Tue, 12 Nov 2024 10:17:10 -0700 Subject: [PATCH 6/9] update test for updated data --- tests/test_store.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/tests/test_store.py b/tests/test_store.py index 5b5d6e11c..eeb2276f7 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -1006,26 +1006,23 @@ def test_tidy_table_no_manifest_uploaded(self, synapse_store): expected_metadata = pd.DataFrame( { "Component": { - 0: nan, + 0: "BulkRNA-seqAssay", 1: "BulkRNA-seqAssay", 2: "BulkRNA-seqAssay", 3: "BulkRNA-seqAssay", - 4: "BulkRNA-seqAssay", }, - "FileFormat": {0: nan, 1: "BAM", 2: "BAM", 3: "BAM", 4: "BAM"}, + "FileFormat": {0: "BAM", 1: "BAM", 2: "BAM", 3: "BAM"}, "GenomeBuild": { - 0: nan, + 0: "GRCh37", 1: "GRCh37", 2: "GRCh37", 3: "GRCh37", - 4: "GRCh37", }, "entityId": { - 0: "syn64019999", - 1: "syn64020000", - 2: "syn64020001", - 3: "syn64020002", - 4: "syn64020003", + 0: "syn64020000", + 1: "syn64020001", + 2: "syn64020002", + 3: "syn64020003", }, }, ).set_index("entityId", drop=False) From 6c122ee308ec2035055fdd10c589d48649084a0b Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Tue, 12 Nov 2024 10:33:40 -0700 Subject: [PATCH 7/9] Revert "update test for updated data" This reverts commit 255e3c0bf80db98647b4efa77631fb051c481c17. --- tests/test_store.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/test_store.py b/tests/test_store.py index eeb2276f7..5b5d6e11c 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -1006,23 +1006,26 @@ def test_tidy_table_no_manifest_uploaded(self, synapse_store): expected_metadata = pd.DataFrame( { "Component": { - 0: "BulkRNA-seqAssay", + 0: nan, 1: "BulkRNA-seqAssay", 2: "BulkRNA-seqAssay", 3: "BulkRNA-seqAssay", + 4: "BulkRNA-seqAssay", }, - "FileFormat": {0: "BAM", 1: "BAM", 2: "BAM", 3: "BAM"}, + "FileFormat": {0: nan, 1: "BAM", 2: "BAM", 3: "BAM", 4: "BAM"}, "GenomeBuild": { - 0: "GRCh37", + 0: nan, 1: "GRCh37", 2: "GRCh37", 3: "GRCh37", + 4: "GRCh37", }, "entityId": { - 0: "syn64020000", - 1: "syn64020001", - 2: "syn64020002", - 3: "syn64020003", + 0: "syn64019999", + 1: "syn64020000", + 2: "syn64020001", + 3: "syn64020002", + 4: "syn64020003", }, }, ).set_index("entityId", drop=False) From ad71660c6298094a177fdf975d12a311fe7c4b92 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Tue, 12 Nov 2024 11:02:34 -0700 Subject: [PATCH 8/9] Revert "catch all exceptions to switch to sequential mode" This reverts commit 68b0b244ef97f0705ae33eee925cfe72100330e9. --- schematic/store/synapse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index 29457fb3d..1062c0c95 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -2825,7 +2825,7 @@ def getDatasetAnnotations( try: logger.info("Trying batch mode for retrieving Synapse annotations") table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids) - except: + except (SynapseAuthenticationError, SynapseHTTPError): logger.info( f"Unable to create a temporary file view bound to {datasetId}. " "Defaulting to slower iterative retrieval of annotations." From cc1fb27ec4a58064ef90ec8df4a62ce1fc418588 Mon Sep 17 00:00:00 2001 From: GiaJordan Date: Tue, 12 Nov 2024 12:48:36 -0700 Subject: [PATCH 9/9] catch ValueErrors as well --- schematic/store/synapse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index 1062c0c95..33b811e84 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -2825,7 +2825,7 @@ def getDatasetAnnotations( try: logger.info("Trying batch mode for retrieving Synapse annotations") table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids) - except (SynapseAuthenticationError, SynapseHTTPError): + except (SynapseAuthenticationError, SynapseHTTPError, ValueError): logger.info( f"Unable to create a temporary file view bound to {datasetId}. " "Defaulting to slower iterative retrieval of annotations."