[wip] copy cram tests for sqlite engine in existing files

nextstrain · Jul 2, 2022 · 2f4611d · 2f4611d
1 parent 64bbe99
commit 2f4611d
Show file tree

Hide file tree

Showing 14 changed files with 313 additions and 4 deletions.
diff --git a/tests/functional/filter/cram/filter-exclude-include.t b/tests/functional/filter/cram/filter-exclude-include.t
@@ -7,6 +7,9 @@ Filter with exclude query for two regions that comprise all but one strain.
 This filter should leave a single record from Oceania.
 Force include one South American record by country to get two total records.
 
+Pandas engine
+-------------
+
   $ ${AUGUR} filter \
   >  --metadata filter/data/metadata.tsv \
   >  --exclude-where "region=South America" "region=North America" "region=Southeast Asia" \
@@ -15,3 +18,15 @@ Force include one South American record by country to get two total records.
   $ wc -l "$TMP/filtered_strains.txt"
   \s*2 .* (re)
   $ rm -f "$TMP/filtered_strains.txt"
+
+SQLite engine
+-------------
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --metadata filter/data/metadata.tsv \
+  >  --exclude-where "region=South America" "region=North America" "region=Southeast Asia" \
+  >  --include-where "country=Ecuador" \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  $ wc -l "$TMP/filtered_strains.txt"
+  \s*2 .* (re)
+  $ rm -f "$TMP/filtered_strains.txt"
diff --git a/tests/functional/filter/cram/filter-metadata-duplicates-error.t b/tests/functional/filter/cram/filter-metadata-duplicates-error.t
@@ -3,7 +3,12 @@ Setup
   $ pushd "$TESTDIR" > /dev/null
   $ source _setup.sh
 
-Error on duplicates in metadata within same chunk.
+Error on duplicates in metadata.
+
+Pandas engine
+-------------
+
+Within same chunk:
 
   $ cat >$TMP/metadata-duplicates.tsv <<~~
   > strain	date
@@ -26,7 +31,7 @@ Error on duplicates in metadata within same chunk.
   cat: .*: No such file or directory (re)
   [1]
 
-Error on duplicates in metadata in separate chunks.
+Separate chunks:
 
   $ ${AUGUR} filter \
   >   --metadata $TMP/metadata-duplicates.tsv \
@@ -40,3 +45,18 @@ Error on duplicates in metadata in separate chunks.
   $ cat $TMP/metadata-filtered.tsv
   cat: .*: No such file or directory (re)
   [1]
+
+SQLite engine
+-------------
+
+  $ ${AUGUR} filter --engine sqlite \
+  >   --metadata $TMP/metadata-duplicates.tsv \
+  >   --group-by year \
+  >   --sequences-per-group 2 \
+  >   --subsample-seed 0 \
+  >   --output-metadata $TMP/metadata-filtered.tsv > /dev/null
+  ERROR: Duplicate found in .* (re)
+  [2]
+  $ cat $TMP/metadata-filtered.tsv
+  cat: .*: No such file or directory (re)
+  [1]
diff --git a/tests/functional/filter/cram/filter-metadata-not-found-error.t b/tests/functional/filter/cram/filter-metadata-not-found-error.t
@@ -5,10 +5,24 @@ Setup
 
 Try to filter on a metadata file that does not exist.
 
+Pandas engine
+-------------
+
   $ ${AUGUR} filter \
   >  --metadata file-does-not-exist.tsv \
   >  --group-by year month \
   >  --sequences-per-group 1 \
   >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
   ERROR: No such file or directory: 'file-does-not-exist.tsv'
   [2]
+
+SQLite engine
+-------------
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --metadata file-does-not-exist.tsv \
+  >  --group-by year month \
+  >  --sequences-per-group 1 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  ERROR: No such file or directory: 'file-does-not-exist.tsv'
+  [2]
diff --git a/tests/functional/filter/cram/filter-metadata-sequence-strains-mismatch.t b/tests/functional/filter/cram/filter-metadata-sequence-strains-mismatch.t
@@ -9,6 +9,9 @@ The metadata are missing one strain that has a sequence.
 The list of strains to include has one strain with no metadata/sequence and one strain with information that would have been filtered by country.
 The query initially filters 3 strains from Colombia, one of which is added back by the include.
 
+Pandas engine
+-------------
+
   $ ${AUGUR} filter \
   >  --sequence-index filter/data/sequence_index.tsv \
   >  --metadata filter/data/metadata.tsv \
@@ -27,3 +30,25 @@ The query initially filters 3 strains from Colombia, one of which is added back
 
   $ diff -u <(sort -k 1,1 filter/data/filtered_log.tsv) <(sort -k 1,1 "$TMP/filtered_log.tsv")
   $ rm -f "$TMP/filtered_strains.txt"
+
+SQLite engine
+-------------
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --sequence-index filter/data/sequence_index.tsv \
+  >  --metadata filter/data/metadata.tsv \
+  >  --query "country != 'Colombia'" \
+  >  --non-nucleotide \
+  >  --exclude-ambiguous-dates-by year \
+  >  --include filter/data/include.txt \
+  >  --output-strains "$TMP/filtered_strains.txt" \
+  >  --output-log "$TMP/filtered_log.tsv"
+  4 strains were dropped during filtering
+  \t1 had no metadata (esc)
+  \t1 had no sequence data (esc)
+  \t3 of these were filtered out by the query: "country != 'Colombia'" (esc)
+  \t1 strains were added back because they were in filter/data/include.txt (esc)
+  9 strains passed all filters
+
+  $ diff -u <(sort -k 1,1 filter/data/filtered_log.tsv) <(sort -k 1,1 "$TMP/filtered_log.tsv")
+  $ rm -f "$TMP/filtered_strains.txt"
diff --git a/tests/functional/filter/cram/filter-min-date.t b/tests/functional/filter/cram/filter-min-date.t
@@ -6,8 +6,20 @@ Setup
 Filter using only metadata without a sequence index.
 This should work because the requested filters don't rely on sequence information.
 
+Pandas engine
+-------------
+
   $ ${AUGUR} filter \
   >  --metadata filter/data/metadata.tsv \
   >  --min-date 2012 \
   >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
   $ rm -f "$TMP/filtered_strains.txt"
+
+SQLite engine
+-------------
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --metadata filter/data/metadata.tsv \
+  >  --min-date 2012 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  $ rm -f "$TMP/filtered_strains.txt"
diff --git a/tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t b/tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t
@@ -6,9 +6,22 @@ Setup
 Try to filter using only metadata without a sequence index.
 This should fail because the requested filters rely on sequence information.
 
+Pandas engine
+-------------
+
   $ ${AUGUR} filter \
   >  --metadata filter/data/metadata.tsv \
   >  --min-length 10000 \
   >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
   ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
   [2]
+
+SQLite engine
+-------------
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --metadata filter/data/metadata.tsv \
+  >  --min-length 10000 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
+  [2]
diff --git a/tests/functional/filter/cram/filter-min-length-output-metadata.t b/tests/functional/filter/cram/filter-min-length-output-metadata.t
@@ -4,16 +4,30 @@ Setup
   $ source _setup.sh
 
 Filter using only metadata without sequence input or output and save results as filtered metadata.
+Output should include the 8 sequences matching the filters and a header line.
+
+Pandas engine
+-------------
 
   $ ${AUGUR} filter \
   >  --sequence-index filter/data/sequence_index.tsv \
   >  --metadata filter/data/metadata.tsv \
   >  --min-date 2012 \
   >  --min-length 10500 \
   >  --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null
+  $ wc -l "$TMP/filtered_metadata.tsv"
+  \s*9 .* (re)
+  $ rm -f "$TMP/filtered_metadata.tsv"
 
-Output should include the 8 sequences matching the filters and a header line.
+SQLite engine
+-------------
 
+  $ ${AUGUR} filter --engine sqlite \
+  >  --sequence-index filter/data/sequence_index.tsv \
+  >  --metadata filter/data/metadata.tsv \
+  >  --min-date 2012 \
+  >  --min-length 10500 \
+  >  --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null
   $ wc -l "$TMP/filtered_metadata.tsv"
   \s*9 .* (re)
   $ rm -f "$TMP/filtered_metadata.tsv"
diff --git a/tests/functional/filter/cram/filter-min-length-output-strains.t b/tests/functional/filter/cram/filter-min-length-output-strains.t
@@ -4,16 +4,30 @@ Setup
   $ source _setup.sh
 
 Filter using only metadata and save results as a list of filtered strains.
+Output should include only the 8 sequences matching the filters (without a header line).
+
+Pandas engine
+-------------
 
   $ ${AUGUR} filter \
   >  --sequence-index filter/data/sequence_index.tsv \
   >  --metadata filter/data/metadata.tsv \
   >  --min-date 2012 \
   >  --min-length 10500 \
   >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  $ wc -l "$TMP/filtered_strains.txt"
+  \s*8 .* (re)
+  $ rm -f "$TMP/filtered_strains.txt"
 
-Output should include only the 8 sequences matching the filters (without a header line).
+SQLite engine
+-------------
 
+  $ ${AUGUR} filter --engine sqlite \
+  >  --sequence-index filter/data/sequence_index.tsv \
+  >  --metadata filter/data/metadata.tsv \
+  >  --min-date 2012 \
+  >  --min-length 10500 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
   $ wc -l "$TMP/filtered_strains.txt"
   \s*8 .* (re)
   $ rm -f "$TMP/filtered_strains.txt"
diff --git a/tests/functional/filter/cram/filter-min-max-date-output.t b/tests/functional/filter/cram/filter-min-max-date-output.t
@@ -5,6 +5,9 @@ Setup
 
 Check output of min/max date filters.
 
+Pandas engine
+-------------
+
   $ ${AUGUR} filter \
   >  --metadata filter/data/metadata.tsv \
   >  --min-date 2015-01-01 \
@@ -14,3 +17,16 @@ Check output of min/max date filters.
   \t1 of these were dropped because they were earlier than 2015.0 or missing a date (esc)
   \t7 of these were dropped because they were later than 2016.09 or missing a date (esc)
   4 strains passed all filters
+
+SQLite engine
+-------------
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --metadata filter/data/metadata.tsv \
+  >  --min-date 2015-01-01 \
+  >  --max-date 2016-02-01 \
+  >  --output-metadata "$TMP/filtered_metadata.tsv"
+  8 strains were dropped during filtering
+  \t1 of these were dropped because they were earlier than 2015.0 or missing a date (esc)
+  \t7 of these were dropped because they were later than 2016.09 or missing a date (esc)
+  4 strains passed all filters
diff --git a/tests/functional/filter/cram/filter-mismatched-sequences-error.t b/tests/functional/filter/cram/filter-mismatched-sequences-error.t
@@ -3,6 +3,9 @@ Setup
   $ pushd "$TESTDIR" > /dev/null
   $ source _setup.sh
 
+Pandas engine
+-------------
+
 Try to filter with sequences that don't match any of the metadata.
 This should produce no results because the intersection of metadata and sequences is empty.
 
@@ -51,3 +54,55 @@ Since we expect metadata to be filtered by presence of strains in input sequence
   $ wc -l "$TMP/filtered_strains.txt"
   \s*0 .* (re)
   $ rm -f "$TMP/filtered_strains.txt"
+
+SQLite engine
+-------------
+
+Try to filter with sequences that don't match any of the metadata.
+This should produce no results because the intersection of metadata and sequences is empty.
+
+  $ echo -e ">something\nATCG" > "$TMP/dummy.fasta"
+  $ ${AUGUR} filter --engine sqlite \
+  >  --sequences "$TMP/dummy.fasta" \
+  >  --metadata filter/data/metadata.tsv \
+  >  --min-length 4 \
+  >  --max-date 2020-01-30 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
+  ERROR: All samples have been dropped! Check filter rules and metadata file format.
+  [2]
+  $ wc -l "$TMP/filtered_strains.txt"
+  \s*0 .* (re)
+  $ rm -f "$TMP/filtered_strains.txt"
+
+Repeat with sequence and strain outputs. We should get the same results.
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --sequences "$TMP/dummy.fasta" \
+  >  --metadata filter/data/metadata.tsv \
+  >  --max-date 2020-01-30 \
+  >  --output-strains "$TMP/filtered_strains.txt" \
+  >  --output-sequences "$TMP/filtered.fasta" > /dev/null
+  Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
+  ERROR: All samples have been dropped! Check filter rules and metadata file format.
+  [2]
+  $ wc -l "$TMP/filtered_strains.txt"
+  \s*0 .* (re)
+  $ grep "^>" "$TMP/filtered.fasta" | wc -l
+  \s*0 (re)
+  $ rm -f "$TMP/filtered_strains.txt"
+  $ rm -f "$TMP/filtered.fasta"
+
+Repeat without any sequence-based filters.
+Since we expect metadata to be filtered by presence of strains in input sequences, this should produce no results because the intersection of metadata and sequences is empty.
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --sequences "$TMP/dummy.fasta" \
+  >  --metadata filter/data/metadata.tsv \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
+  ERROR: All samples have been dropped! Check filter rules and metadata file format.
+  [2]
+  $ wc -l "$TMP/filtered_strains.txt"
+  \s*0 .* (re)
+  $ rm -f "$TMP/filtered_strains.txt"
diff --git a/tests/functional/filter/cram/filter-no-outputs-error.t b/tests/functional/filter/cram/filter-no-outputs-error.t
@@ -5,9 +5,22 @@ Setup
 
 Try to filter without any outputs.
 
+Pandas engine
+-------------
+
   $ ${AUGUR} filter \
   >  --sequence-index filter/data/sequence_index.tsv \
   >  --metadata filter/data/metadata.tsv \
   >  --min-length 10000 > /dev/null
   ERROR: You need to select at least one output.
   [2]
+
+SQLite engine
+-------------
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --sequence-index filter/data/sequence_index.tsv \
+  >  --metadata filter/data/metadata.tsv \
+  >  --min-length 10000 > /dev/null
+  ERROR: You need to select at least one output.
+  [2]
diff --git a/tests/functional/filter/cram/filter-output-directory-not-found-error.t b/tests/functional/filter/cram/filter-output-directory-not-found-error.t
@@ -5,10 +5,24 @@ Setup
 
 Try to output to a directory that does not exist.
 
+Pandas engine
+-------------
+
   $ ${AUGUR} filter \
   >  --metadata filter/data/metadata.tsv \
   >  --group-by year month \
   >  --sequences-per-group 1 \
   >  --output-strains "directory-does-not-exist/filtered_strains.txt" > /dev/null
   ERROR: No such file or directory: 'directory-does-not-exist/filtered_strains.txt'
   [2]
+
+SQLite engine
+-------------
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --metadata filter/data/metadata.tsv \
+  >  --group-by year month \
+  >  --sequences-per-group 1 \
+  >  --output-strains "directory-does-not-exist/filtered_strains.txt" > /dev/null
+  ERROR: No such file or directory: 'directory-does-not-exist/filtered_strains.txt'
+  [2]