From 2f4611daa856980d41723a091e56965ea5ad45f9 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 27 May 2022 18:16:21 -0700 Subject: [PATCH] [wip] copy cram tests for sqlite engine in existing files --- .../filter/cram/filter-exclude-include.t | 15 ++++ .../cram/filter-metadata-duplicates-error.t | 24 ++++++- .../cram/filter-metadata-not-found-error.t | 14 ++++ ...ilter-metadata-sequence-strains-mismatch.t | 25 +++++++ .../functional/filter/cram/filter-min-date.t | 12 ++++ ...ilter-min-length-no-sequence-index-error.t | 13 ++++ .../cram/filter-min-length-output-metadata.t | 16 ++++- .../cram/filter-min-length-output-strains.t | 16 ++++- .../filter/cram/filter-min-max-date-output.t | 16 +++++ .../cram/filter-mismatched-sequences-error.t | 55 +++++++++++++++ .../filter/cram/filter-no-outputs-error.t | 13 ++++ .../filter-output-directory-not-found-error.t | 14 ++++ .../filter-output-strains-no-sequence-error.t | 14 ++++ .../filter/cram/filter-query-example.t | 70 +++++++++++++++++++ 14 files changed, 313 insertions(+), 4 deletions(-) diff --git a/tests/functional/filter/cram/filter-exclude-include.t b/tests/functional/filter/cram/filter-exclude-include.t index a00ee82c1..60050c640 100644 --- a/tests/functional/filter/cram/filter-exclude-include.t +++ b/tests/functional/filter/cram/filter-exclude-include.t @@ -7,6 +7,9 @@ Filter with exclude query for two regions that comprise all but one strain. This filter should leave a single record from Oceania. Force include one South American record by country to get two total records. +Pandas engine +------------- + $ ${AUGUR} filter \ > --metadata filter/data/metadata.tsv \ > --exclude-where "region=South America" "region=North America" "region=Southeast Asia" \ @@ -15,3 +18,15 @@ Force include one South American record by country to get two total records. 
$ wc -l "$TMP/filtered_strains.txt" \s*2 .* (re) $ rm -f "$TMP/filtered_strains.txt" + +SQLite engine +------------- + + $ ${AUGUR} filter --engine sqlite \ + > --metadata filter/data/metadata.tsv \ + > --exclude-where "region=South America" "region=North America" "region=Southeast Asia" \ + > --include-where "country=Ecuador" \ + > --output-strains "$TMP/filtered_strains.txt" > /dev/null + $ wc -l "$TMP/filtered_strains.txt" + \s*2 .* (re) + $ rm -f "$TMP/filtered_strains.txt" diff --git a/tests/functional/filter/cram/filter-metadata-duplicates-error.t b/tests/functional/filter/cram/filter-metadata-duplicates-error.t index f082386d2..3c8ce763a 100644 --- a/tests/functional/filter/cram/filter-metadata-duplicates-error.t +++ b/tests/functional/filter/cram/filter-metadata-duplicates-error.t @@ -3,7 +3,12 @@ Setup $ pushd "$TESTDIR" > /dev/null $ source _setup.sh -Error on duplicates in metadata within same chunk. +Error on duplicates in metadata. + +Pandas engine +------------- + +Within same chunk: $ cat >$TMP/metadata-duplicates.tsv <<~~ > strain date @@ -26,7 +31,7 @@ Error on duplicates in metadata within same chunk. cat: .*: No such file or directory (re) [1] -Error on duplicates in metadata in separate chunks. +Separate chunks: $ ${AUGUR} filter \ > --metadata $TMP/metadata-duplicates.tsv \ @@ -40,3 +45,18 @@ Error on duplicates in metadata in separate chunks. 
$ cat $TMP/metadata-filtered.tsv cat: .*: No such file or directory (re) [1] + +SQLite engine +------------- + + $ ${AUGUR} filter --engine sqlite \ + > --metadata $TMP/metadata-duplicates.tsv \ + > --group-by year \ + > --sequences-per-group 2 \ + > --subsample-seed 0 \ + > --output-metadata $TMP/metadata-filtered.tsv > /dev/null + ERROR: Duplicate found in .* (re) + [2] + $ cat $TMP/metadata-filtered.tsv + cat: .*: No such file or directory (re) + [1] diff --git a/tests/functional/filter/cram/filter-metadata-not-found-error.t b/tests/functional/filter/cram/filter-metadata-not-found-error.t index b39614460..98c56dd0e 100644 --- a/tests/functional/filter/cram/filter-metadata-not-found-error.t +++ b/tests/functional/filter/cram/filter-metadata-not-found-error.t @@ -5,6 +5,9 @@ Setup Try to filter on an metadata file that does not exist. +Pandas engine +------------- + $ ${AUGUR} filter \ > --metadata file-does-not-exist.tsv \ > --group-by year month \ @@ -12,3 +15,14 @@ Try to filter on an metadata file that does not exist. > --output-strains "$TMP/filtered_strains.txt" > /dev/null ERROR: No such file or directory: 'file-does-not-exist.tsv' [2] + +SQLite engine +------------- + + $ ${AUGUR} filter --engine sqlite \ + > --metadata file-does-not-exist.tsv \ + > --group-by year month \ + > --sequences-per-group 1 \ + > --output-strains "$TMP/filtered_strains.txt" > /dev/null + ERROR: No such file or directory: 'file-does-not-exist.tsv' + [2] diff --git a/tests/functional/filter/cram/filter-metadata-sequence-strains-mismatch.t b/tests/functional/filter/cram/filter-metadata-sequence-strains-mismatch.t index adacdd7b9..17fef9bd3 100644 --- a/tests/functional/filter/cram/filter-metadata-sequence-strains-mismatch.t +++ b/tests/functional/filter/cram/filter-metadata-sequence-strains-mismatch.t @@ -9,6 +9,9 @@ The metadata are missing one strain that has a sequence.
The list of strains to include has one strain with no metadata/sequence and one strain with information that would have been filtered by country. The query initially filters 3 strains from Colombia, one of which is added back by the include. +Pandas engine +------------- + $ ${AUGUR} filter \ > --sequence-index filter/data/sequence_index.tsv \ > --metadata filter/data/metadata.tsv \ @@ -27,3 +30,25 @@ The query initially filters 3 strains from Colombia, one of which is added back $ diff -u <(sort -k 1,1 filter/data/filtered_log.tsv) <(sort -k 1,1 "$TMP/filtered_log.tsv") $ rm -f "$TMP/filtered_strains.txt" + +SQLite engine +------------- + + $ ${AUGUR} filter --engine sqlite \ + > --sequence-index filter/data/sequence_index.tsv \ + > --metadata filter/data/metadata.tsv \ + > --query "country != 'Colombia'" \ + > --non-nucleotide \ + > --exclude-ambiguous-dates-by year \ + > --include filter/data/include.txt \ + > --output-strains "$TMP/filtered_strains.txt" \ + > --output-log "$TMP/filtered_log.tsv" + 4 strains were dropped during filtering + \t1 had no metadata (esc) + \t1 had no sequence data (esc) + \t3 of these were filtered out by the query: "country != 'Colombia'" (esc) + \t1 strains were added back because they were in filter/data/include.txt (esc) + 9 strains passed all filters + + $ diff -u <(sort -k 1,1 filter/data/filtered_log.tsv) <(sort -k 1,1 "$TMP/filtered_log.tsv") + $ rm -f "$TMP/filtered_strains.txt" diff --git a/tests/functional/filter/cram/filter-min-date.t b/tests/functional/filter/cram/filter-min-date.t index 5fe7dc5bb..a03ecaa38 100644 --- a/tests/functional/filter/cram/filter-min-date.t +++ b/tests/functional/filter/cram/filter-min-date.t @@ -6,8 +6,20 @@ Setup Filter using only metadata without a sequence index. This should work because the requested filters don't rely on sequence information.
+Pandas engine +------------- + $ ${AUGUR} filter \ > --metadata filter/data/metadata.tsv \ > --min-date 2012 \ > --output-strains "$TMP/filtered_strains.txt" > /dev/null $ rm -f "$TMP/filtered_strains.txt" + +SQLite engine +------------- + + $ ${AUGUR} filter --engine sqlite \ + > --metadata filter/data/metadata.tsv \ + > --min-date 2012 \ + > --output-strains "$TMP/filtered_strains.txt" > /dev/null + $ rm -f "$TMP/filtered_strains.txt" diff --git a/tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t b/tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t index d360c8a54..c1f7c9710 100644 --- a/tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t +++ b/tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t @@ -6,9 +6,22 @@ Setup Try to filter using only metadata without a sequence index. This should fail because the requested filters rely on sequence information. +Pandas engine +------------- + $ ${AUGUR} filter \ > --metadata filter/data/metadata.tsv \ > --min-length 10000 \ > --output-strains "$TMP/filtered_strains.txt" > /dev/null ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information. [2] + +SQLite engine +------------- + + $ ${AUGUR} filter --engine sqlite \ + > --metadata filter/data/metadata.tsv \ + > --min-length 10000 \ + > --output-strains "$TMP/filtered_strains.txt" > /dev/null + ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information. 
+ [2] diff --git a/tests/functional/filter/cram/filter-min-length-output-metadata.t b/tests/functional/filter/cram/filter-min-length-output-metadata.t index 320706812..a8d7cbf53 100644 --- a/tests/functional/filter/cram/filter-min-length-output-metadata.t +++ b/tests/functional/filter/cram/filter-min-length-output-metadata.t @@ -4,6 +4,10 @@ Setup $ source _setup.sh Filter using only metadata without sequence input or output and save results as filtered metadata. +Output should include the 8 sequences matching the filters and a header line. + +Pandas engine +------------- $ ${AUGUR} filter \ > --sequence-index filter/data/sequence_index.tsv \ @@ -11,9 +15,19 @@ Filter using only metadata without sequence input or output and save results as > --min-date 2012 \ > --min-length 10500 \ > --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null + $ wc -l "$TMP/filtered_metadata.tsv" + \s*9 .* (re) + $ rm -f "$TMP/filtered_metadata.tsv" -Output should include the 8 sequences matching the filters and a header line. +SQLite engine +------------- + $ ${AUGUR} filter --engine sqlite \ + > --sequence-index filter/data/sequence_index.tsv \ + > --metadata filter/data/metadata.tsv \ + > --min-date 2012 \ + > --min-length 10500 \ + > --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null $ wc -l "$TMP/filtered_metadata.tsv" \s*9 .* (re) $ rm -f "$TMP/filtered_metadata.tsv" diff --git a/tests/functional/filter/cram/filter-min-length-output-strains.t b/tests/functional/filter/cram/filter-min-length-output-strains.t index 50cbac6b1..9d8e329bb 100644 --- a/tests/functional/filter/cram/filter-min-length-output-strains.t +++ b/tests/functional/filter/cram/filter-min-length-output-strains.t @@ -4,6 +4,10 @@ Setup $ source _setup.sh Filter using only metadata and save results as a list of filtered strains. +Output should include only the 8 sequences matching the filters (without a header line). 
+ +Pandas engine +------------- $ ${AUGUR} filter \ > --sequence-index filter/data/sequence_index.tsv \ @@ -11,9 +15,19 @@ Filter using only metadata and save results as a list of filtered strains. > --min-date 2012 \ > --min-length 10500 \ > --output-strains "$TMP/filtered_strains.txt" > /dev/null + $ wc -l "$TMP/filtered_strains.txt" + \s*8 .* (re) + $ rm -f "$TMP/filtered_strains.txt" -Output should include only the 8 sequences matching the filters (without a header line). +SQLite engine +------------- + $ ${AUGUR} filter --engine sqlite \ + > --sequence-index filter/data/sequence_index.tsv \ + > --metadata filter/data/metadata.tsv \ + > --min-date 2012 \ + > --min-length 10500 \ + > --output-strains "$TMP/filtered_strains.txt" > /dev/null $ wc -l "$TMP/filtered_strains.txt" \s*8 .* (re) $ rm -f "$TMP/filtered_strains.txt" diff --git a/tests/functional/filter/cram/filter-min-max-date-output.t b/tests/functional/filter/cram/filter-min-max-date-output.t index 0b09331b9..3aad7f542 100644 --- a/tests/functional/filter/cram/filter-min-max-date-output.t +++ b/tests/functional/filter/cram/filter-min-max-date-output.t @@ -5,6 +5,9 @@ Setup Check output of min/max date filters. +Pandas engine +------------- + $ ${AUGUR} filter \ > --metadata filter/data/metadata.tsv \ > --min-date 2015-01-01 \ @@ -14,3 +17,16 @@ Check output of min/max date filters. 
\t1 of these were dropped because they were earlier than 2015.0 or missing a date (esc) \t7 of these were dropped because they were later than 2016.09 or missing a date (esc) 4 strains passed all filters + +SQLite engine +------------- + + $ ${AUGUR} filter --engine sqlite \ + > --metadata filter/data/metadata.tsv \ + > --min-date 2015-01-01 \ + > --max-date 2016-02-01 \ + > --output-metadata "$TMP/filtered_metadata.tsv" + 8 strains were dropped during filtering + \t1 of these were dropped because they were earlier than 2015.0 or missing a date (esc) + \t7 of these were dropped because they were later than 2016.09 or missing a date (esc) + 4 strains passed all filters diff --git a/tests/functional/filter/cram/filter-mismatched-sequences-error.t b/tests/functional/filter/cram/filter-mismatched-sequences-error.t index 14da5e054..e4ce30c99 100644 --- a/tests/functional/filter/cram/filter-mismatched-sequences-error.t +++ b/tests/functional/filter/cram/filter-mismatched-sequences-error.t @@ -3,6 +3,9 @@ Setup $ pushd "$TESTDIR" > /dev/null $ source _setup.sh +Pandas engine +------------- + Try to filter with sequences that don't match any of the metadata. This should produce no results because the intersection of metadata and sequences is empty. @@ -51,3 +54,55 @@ Since we expect metadata to be filtered by presence of strains in input sequence $ wc -l "$TMP/filtered_strains.txt" \s*0 .* (re) $ rm -f "$TMP/filtered_strains.txt" + +SQLite engine +------------- + +Try to filter with sequences that don't match any of the metadata. +This should produce no results because the intersection of metadata and sequences is empty. + + $ echo -e ">something\nATCG" > "$TMP/dummy.fasta" + $ ${AUGUR} filter --engine sqlite \ + > --sequences "$TMP/dummy.fasta" \ + > --metadata filter/data/metadata.tsv \ + > --min-length 4 \ + > --max-date 2020-01-30 \ + > --output-strains "$TMP/filtered_strains.txt" > /dev/null + Note: You did not provide a sequence index, so Augur will generate one. 
You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`. + ERROR: All samples have been dropped! Check filter rules and metadata file format. + [2] + $ wc -l "$TMP/filtered_strains.txt" + \s*0 .* (re) + $ rm -f "$TMP/filtered_strains.txt" + +Repeat with sequence and strain outputs. We should get the same results. + + $ ${AUGUR} filter --engine sqlite \ + > --sequences "$TMP/dummy.fasta" \ + > --metadata filter/data/metadata.tsv \ + > --max-date 2020-01-30 \ + > --output-strains "$TMP/filtered_strains.txt" \ + > --output-sequences "$TMP/filtered.fasta" > /dev/null + Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`. + ERROR: All samples have been dropped! Check filter rules and metadata file format. + [2] + $ wc -l "$TMP/filtered_strains.txt" + \s*0 .* (re) + $ grep "^>" "$TMP/filtered.fasta" | wc -l + \s*0 (re) + $ rm -f "$TMP/filtered_strains.txt" + $ rm -f "$TMP/filtered.fasta" + +Repeat without any sequence-based filters. +Since we expect metadata to be filtered by presence of strains in input sequences, this should produce no results because the intersection of metadata and sequences is empty. + + $ ${AUGUR} filter --engine sqlite \ + > --sequences "$TMP/dummy.fasta" \ + > --metadata filter/data/metadata.tsv \ + > --output-strains "$TMP/filtered_strains.txt" > /dev/null + Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`. + ERROR: All samples have been dropped! Check filter rules and metadata file format. 
+ [2] + $ wc -l "$TMP/filtered_strains.txt" + \s*0 .* (re) + $ rm -f "$TMP/filtered_strains.txt" diff --git a/tests/functional/filter/cram/filter-no-outputs-error.t b/tests/functional/filter/cram/filter-no-outputs-error.t index 7eedec3e1..52e8866f5 100644 --- a/tests/functional/filter/cram/filter-no-outputs-error.t +++ b/tests/functional/filter/cram/filter-no-outputs-error.t @@ -5,9 +5,22 @@ Setup Try to filter without any outputs. +Pandas engine +------------- + $ ${AUGUR} filter \ > --sequence-index filter/data/sequence_index.tsv \ > --metadata filter/data/metadata.tsv \ > --min-length 10000 > /dev/null ERROR: You need to select at least one output. [2] + +SQLite engine +------------- + + $ ${AUGUR} filter --engine sqlite \ + > --sequence-index filter/data/sequence_index.tsv \ + > --metadata filter/data/metadata.tsv \ + > --min-length 10000 > /dev/null + ERROR: You need to select at least one output. + [2] diff --git a/tests/functional/filter/cram/filter-output-directory-not-found-error.t b/tests/functional/filter/cram/filter-output-directory-not-found-error.t index 4fc4131cb..55cdfb70a 100644 --- a/tests/functional/filter/cram/filter-output-directory-not-found-error.t +++ b/tests/functional/filter/cram/filter-output-directory-not-found-error.t @@ -5,6 +5,9 @@ Setup Try to output to a directory that does not exist. +Pandas engine +------------- + $ ${AUGUR} filter \ > --metadata filter/data/metadata.tsv \ > --group-by year month \ @@ -12,3 +15,14 @@ Try to output to a directory that does not exist. 
> --output-strains "directory-does-not-exist/filtered_strains.txt" > /dev/null ERROR: No such file or directory: 'directory-does-not-exist/filtered_strains.txt' [2] + +SQLite engine +------------- + + $ ${AUGUR} filter --engine sqlite \ + > --metadata filter/data/metadata.tsv \ + > --group-by year month \ + > --sequences-per-group 1 \ + > --output-strains "directory-does-not-exist/filtered_strains.txt" > /dev/null + ERROR: No such file or directory: 'directory-does-not-exist/filtered_strains.txt' + [2] diff --git a/tests/functional/filter/cram/filter-output-strains-no-sequence-error.t b/tests/functional/filter/cram/filter-output-strains-no-sequence-error.t index 2648c18b4..fd67b0ccc 100644 --- a/tests/functional/filter/cram/filter-output-strains-no-sequence-error.t +++ b/tests/functional/filter/cram/filter-output-strains-no-sequence-error.t @@ -6,6 +6,9 @@ Setup Try to filter with sequence outputs and no sequence inputs. This should fail. +Pandas engine +------------- + $ ${AUGUR} filter \ > --sequence-index filter/data/sequence_index.tsv \ > --metadata filter/data/metadata.tsv \ @@ -13,3 +16,14 @@ This should fail. > --output "$TMP/filtered.fasta" > /dev/null ERROR: You need to provide sequences to output sequences. [2] + +SQLite engine +------------- + + $ ${AUGUR} filter --engine sqlite \ + > --sequence-index filter/data/sequence_index.tsv \ + > --metadata filter/data/metadata.tsv \ + > --min-length 10000 \ + > --output "$TMP/filtered.fasta" > /dev/null + ERROR: You need to provide sequences to output sequences. 
+ [2] diff --git a/tests/functional/filter/cram/filter-query-example.t b/tests/functional/filter/cram/filter-query-example.t index bbcfb8bf1..8883aaf14 100644 --- a/tests/functional/filter/cram/filter-query-example.t +++ b/tests/functional/filter/cram/filter-query-example.t @@ -3,6 +3,9 @@ Setup $ pushd "$TESTDIR" > /dev/null $ source _setup.sh +Pandas engine +------------- + Filter into two separate sets and then select sequences from the union of those sets. First, select strains from Brazil (there should be 1). @@ -66,3 +69,70 @@ Alternately, exclude the sequences from Brazil and Colombia (N=4) and records wi $ grep "^>" "$TMP/filtered.fasta" | wc -l \s*7 (re) $ rm -f "$TMP/filtered.fasta" + +SQLite engine +------------- + +Filter into two separate sets and then select sequences from the union of those sets. +First, select strains from Brazil (there should be 1). + + $ ${AUGUR} filter --engine sqlite \ + > --metadata filter/data/metadata.tsv \ + > --query "country == 'Brazil'" \ + > --output-strains "$TMP/filtered_strains.brazil.txt" > /dev/null + $ wc -l "$TMP/filtered_strains.brazil.txt" + \s*1 .* (re) + +Then, select strains from Colombia (there should be 3). + + $ ${AUGUR} filter --engine sqlite \ + > --metadata filter/data/metadata.tsv \ + > --query "country == 'Colombia'" \ + > --output-strains "$TMP/filtered_strains.colombia.txt" > /dev/null + $ wc -l "$TMP/filtered_strains.colombia.txt" + \s*3 .* (re) + +Finally, exclude all sequences except those from the two sets of strains (there should be 4). 
+ + $ ${AUGUR} filter --engine sqlite \ + > --sequences filter/data/sequences.fasta \ + > --sequence-index filter/data/sequence_index.tsv \ + > --metadata filter/data/metadata.tsv \ + > --exclude-all \ + > --include "$TMP/filtered_strains.brazil.txt" "$TMP/filtered_strains.colombia.txt" \ + > --output "$TMP/filtered.fasta" > /dev/null + $ grep "^>" "$TMP/filtered.fasta" | wc -l + \s*4 (re) + $ rm -f "$TMP/filtered.fasta" + +Repeat this filter without a sequence index. +We should get the same outputs without building a sequence index on the fly, because the exclude-all flag tells us we only want to force-include strains and skip all other filters. + + $ ${AUGUR} filter --engine sqlite \ + > --sequences filter/data/sequences.fasta \ + > --metadata filter/data/metadata.tsv \ + > --exclude-all \ + > --include "$TMP/filtered_strains.brazil.txt" "$TMP/filtered_strains.colombia.txt" \ + > --output "$TMP/filtered.fasta" \ + > --output-metadata "$TMP/filtered.tsv" > /dev/null + $ grep "^>" "$TMP/filtered.fasta" | wc -l + \s*4 (re) + $ rm -f "$TMP/filtered.fasta" + +Metadata should have the same number of records as the sequences plus a header. + + $ wc -l "$TMP/filtered.tsv" + \s*5 .* (re) + $ rm -f "$TMP/filtered.tsv" + +Alternately, exclude the sequences from Brazil and Colombia (N=4) and records without sequences (N=1) or metadata (N=1). + + $ ${AUGUR} filter --engine sqlite \ + > --sequences filter/data/sequences.fasta \ + > --sequence-index filter/data/sequence_index.tsv \ + > --metadata filter/data/metadata.tsv \ + > --exclude "$TMP/filtered_strains.brazil.txt" "$TMP/filtered_strains.colombia.txt" \ + > --output "$TMP/filtered.fasta" > /dev/null + $ grep "^>" "$TMP/filtered.fasta" | wc -l + \s*7 (re) + $ rm -f "$TMP/filtered.fasta"