From 2f4611daa856980d41723a091e56965ea5ad45f9 Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Fri, 27 May 2022 18:16:21 -0700
Subject: [PATCH] [wip] copy cram tests for sqlite engine in existing files

---
 .../filter/cram/filter-exclude-include.t      | 15 ++++
 .../cram/filter-metadata-duplicates-error.t   | 24 ++++++-
 .../cram/filter-metadata-not-found-error.t    | 14 ++++
 ...ilter-metadata-sequence-strains-mismatch.t | 25 +++++++
 .../functional/filter/cram/filter-min-date.t  | 12 ++++
 ...ilter-min-length-no-sequence-index-error.t | 13 ++++
 .../cram/filter-min-length-output-metadata.t  | 16 ++++-
 .../cram/filter-min-length-output-strains.t   | 16 ++++-
 .../filter/cram/filter-min-max-date-output.t  | 16 +++++
 .../cram/filter-mismatched-sequences-error.t  | 55 +++++++++++++++
 .../filter/cram/filter-no-outputs-error.t     | 13 ++++
 .../filter-output-directory-not-found-error.t | 14 ++++
 .../filter-output-strains-no-sequence-error.t | 14 ++++
 .../filter/cram/filter-query-example.t        | 70 +++++++++++++++++++
 14 files changed, 313 insertions(+), 4 deletions(-)

diff --git a/tests/functional/filter/cram/filter-exclude-include.t b/tests/functional/filter/cram/filter-exclude-include.t
index a00ee82c1..60050c640 100644
--- a/tests/functional/filter/cram/filter-exclude-include.t
+++ b/tests/functional/filter/cram/filter-exclude-include.t
@@ -7,6 +7,9 @@ Filter with exclude query for two regions that comprise all but one strain.
 This filter should leave a single record from Oceania.
 Force include one South American record by country to get two total records.
 
+Pandas engine
+-------------
+
   $ ${AUGUR} filter \
   >  --metadata filter/data/metadata.tsv \
   >  --exclude-where "region=South America" "region=North America" "region=Southeast Asia" \
@@ -15,3 +18,15 @@ Force include one South American record by country to get two total records.
   $ wc -l "$TMP/filtered_strains.txt"
   \s*2 .* (re)
   $ rm -f "$TMP/filtered_strains.txt"
+
+SQLite engine
+-------------
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --metadata filter/data/metadata.tsv \
+  >  --exclude-where "region=South America" "region=North America" "region=Southeast Asia" \
+  >  --include-where "country=Ecuador" \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  $ wc -l "$TMP/filtered_strains.txt"
+  \s*2 .* (re)
+  $ rm -f "$TMP/filtered_strains.txt"
diff --git a/tests/functional/filter/cram/filter-metadata-duplicates-error.t b/tests/functional/filter/cram/filter-metadata-duplicates-error.t
index f082386d2..3c8ce763a 100644
--- a/tests/functional/filter/cram/filter-metadata-duplicates-error.t
+++ b/tests/functional/filter/cram/filter-metadata-duplicates-error.t
@@ -3,7 +3,12 @@ Setup
   $ pushd "$TESTDIR" > /dev/null
   $ source _setup.sh
 
-Error on duplicates in metadata within same chunk.
+Error on duplicates in metadata.
+
+Pandas engine
+-------------
+
+Within the same chunk:
 
   $ cat >$TMP/metadata-duplicates.tsv <<~~
   > strain	date
@@ -26,7 +31,7 @@ Error on duplicates in metadata within same chunk.
   cat: .*: No such file or directory (re)
   [1]
 
-Error on duplicates in metadata in separate chunks.
+In separate chunks:
 
   $ ${AUGUR} filter \
   >   --metadata $TMP/metadata-duplicates.tsv \
@@ -40,3 +45,18 @@ Error on duplicates in metadata in separate chunks.
   $ cat $TMP/metadata-filtered.tsv
   cat: .*: No such file or directory (re)
   [1]
+
+SQLite engine
+-------------
+
+  $ ${AUGUR} filter --engine sqlite \
+  >   --metadata $TMP/metadata-duplicates.tsv \
+  >   --group-by year \
+  >   --sequences-per-group 2 \
+  >   --subsample-seed 0 \
+  >   --output-metadata $TMP/metadata-filtered.tsv > /dev/null
+  ERROR: Duplicate found in .* (re)
+  [2]
+  $ cat $TMP/metadata-filtered.tsv
+  cat: .*: No such file or directory (re)
+  [1]
diff --git a/tests/functional/filter/cram/filter-metadata-not-found-error.t b/tests/functional/filter/cram/filter-metadata-not-found-error.t
index b39614460..98c56dd0e 100644
--- a/tests/functional/filter/cram/filter-metadata-not-found-error.t
+++ b/tests/functional/filter/cram/filter-metadata-not-found-error.t
@@ -5,6 +5,9 @@ Setup
 
 Try to filter on an metadata file that does not exist.
 
+Pandas engine
+-------------
+
   $ ${AUGUR} filter \
   >  --metadata file-does-not-exist.tsv \
   >  --group-by year month \
@@ -12,3 +15,14 @@ Try to filter on an metadata file that does not exist.
   >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
   ERROR: No such file or directory: 'file-does-not-exist.tsv'
   [2]
+
+SQLite engine
+-------------
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --metadata file-does-not-exist.tsv \
+  >  --group-by year month \
+  >  --sequences-per-group 1 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  ERROR: No such file or directory: 'file-does-not-exist.tsv'
+  [2]
diff --git a/tests/functional/filter/cram/filter-metadata-sequence-strains-mismatch.t b/tests/functional/filter/cram/filter-metadata-sequence-strains-mismatch.t
index adacdd7b9..17fef9bd3 100644
--- a/tests/functional/filter/cram/filter-metadata-sequence-strains-mismatch.t
+++ b/tests/functional/filter/cram/filter-metadata-sequence-strains-mismatch.t
@@ -9,6 +9,9 @@ The metadata are missing one strain that has a sequence.
 The list of strains to include has one strain with no metadata/sequence and one strain with information that would have been filtered by country.
 The query initially filters 3 strains from Colombia, one of which is added back by the include.
 
+Pandas engine
+-------------
+
   $ ${AUGUR} filter \
   >  --sequence-index filter/data/sequence_index.tsv \
   >  --metadata filter/data/metadata.tsv \
@@ -27,3 +30,25 @@ The query initially filters 3 strains from Colombia, one of which is added back
 
   $ diff -u <(sort -k 1,1 filter/data/filtered_log.tsv) <(sort -k 1,1 "$TMP/filtered_log.tsv")
   $ rm -f "$TMP/filtered_strains.txt"
+
+SQLite engine
+-------------
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --sequence-index filter/data/sequence_index.tsv \
+  >  --metadata filter/data/metadata.tsv \
+  >  --query "country != 'Colombia'" \
+  >  --non-nucleotide \
+  >  --exclude-ambiguous-dates-by year \
+  >  --include filter/data/include.txt \
+  >  --output-strains "$TMP/filtered_strains.txt" \
+  >  --output-log "$TMP/filtered_log.tsv"
+  4 strains were dropped during filtering
+  \t1 had no metadata (esc)
+  \t1 had no sequence data (esc)
+  \t3 of these were filtered out by the query: "country != 'Colombia'" (esc)
+  \t1 strains were added back because they were in filter/data/include.txt (esc)
+  9 strains passed all filters
+
+  $ diff -u <(sort -k 1,1 filter/data/filtered_log.tsv) <(sort -k 1,1 "$TMP/filtered_log.tsv")
+  $ rm -f "$TMP/filtered_strains.txt"
diff --git a/tests/functional/filter/cram/filter-min-date.t b/tests/functional/filter/cram/filter-min-date.t
index 5fe7dc5bb..a03ecaa38 100644
--- a/tests/functional/filter/cram/filter-min-date.t
+++ b/tests/functional/filter/cram/filter-min-date.t
@@ -6,8 +6,20 @@ Setup
 Filter using only metadata without a sequence index.
 This should work because the requested filters don't rely on sequence information.
 
+Pandas engine
+-------------
+
   $ ${AUGUR} filter \
   >  --metadata filter/data/metadata.tsv \
   >  --min-date 2012 \
   >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
   $ rm -f "$TMP/filtered_strains.txt"
+
+SQLite engine
+-------------
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --metadata filter/data/metadata.tsv \
+  >  --min-date 2012 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  $ rm -f "$TMP/filtered_strains.txt"
diff --git a/tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t b/tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t
index d360c8a54..c1f7c9710 100644
--- a/tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t
+++ b/tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t
@@ -6,9 +6,22 @@ Setup
 Try to filter using only metadata without a sequence index.
 This should fail because the requested filters rely on sequence information.
 
+Pandas engine
+-------------
+
   $ ${AUGUR} filter \
   >  --metadata filter/data/metadata.tsv \
   >  --min-length 10000 \
   >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
   ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
   [2]
+
+SQLite engine
+-------------
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --metadata filter/data/metadata.tsv \
+  >  --min-length 10000 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
+  [2]
diff --git a/tests/functional/filter/cram/filter-min-length-output-metadata.t b/tests/functional/filter/cram/filter-min-length-output-metadata.t
index 320706812..a8d7cbf53 100644
--- a/tests/functional/filter/cram/filter-min-length-output-metadata.t
+++ b/tests/functional/filter/cram/filter-min-length-output-metadata.t
@@ -4,6 +4,10 @@ Setup
   $ source _setup.sh
 
 Filter using only metadata without sequence input or output and save results as filtered metadata.
+Output should include the 8 sequences matching the filters and a header line.
+
+Pandas engine
+-------------
 
   $ ${AUGUR} filter \
   >  --sequence-index filter/data/sequence_index.tsv \
@@ -11,9 +15,19 @@ Filter using only metadata without sequence input or output and save results as
   >  --min-date 2012 \
   >  --min-length 10500 \
   >  --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null
+  $ wc -l "$TMP/filtered_metadata.tsv"
+  \s*9 .* (re)
+  $ rm -f "$TMP/filtered_metadata.tsv"
 
-Output should include the 8 sequences matching the filters and a header line.
+SQLite engine
+-------------
 
+  $ ${AUGUR} filter --engine sqlite \
+  >  --sequence-index filter/data/sequence_index.tsv \
+  >  --metadata filter/data/metadata.tsv \
+  >  --min-date 2012 \
+  >  --min-length 10500 \
+  >  --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null
   $ wc -l "$TMP/filtered_metadata.tsv"
   \s*9 .* (re)
   $ rm -f "$TMP/filtered_metadata.tsv"
diff --git a/tests/functional/filter/cram/filter-min-length-output-strains.t b/tests/functional/filter/cram/filter-min-length-output-strains.t
index 50cbac6b1..9d8e329bb 100644
--- a/tests/functional/filter/cram/filter-min-length-output-strains.t
+++ b/tests/functional/filter/cram/filter-min-length-output-strains.t
@@ -4,6 +4,10 @@ Setup
   $ source _setup.sh
 
 Filter using only metadata and save results as a list of filtered strains.
+Output should include only the 8 sequences matching the filters (without a header line).
+
+Pandas engine
+-------------
 
   $ ${AUGUR} filter \
   >  --sequence-index filter/data/sequence_index.tsv \
@@ -11,9 +15,19 @@ Filter using only metadata and save results as a list of filtered strains.
   >  --min-date 2012 \
   >  --min-length 10500 \
   >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  $ wc -l "$TMP/filtered_strains.txt"
+  \s*8 .* (re)
+  $ rm -f "$TMP/filtered_strains.txt"
 
-Output should include only the 8 sequences matching the filters (without a header line).
+SQLite engine
+-------------
 
+  $ ${AUGUR} filter --engine sqlite \
+  >  --sequence-index filter/data/sequence_index.tsv \
+  >  --metadata filter/data/metadata.tsv \
+  >  --min-date 2012 \
+  >  --min-length 10500 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
   $ wc -l "$TMP/filtered_strains.txt"
   \s*8 .* (re)
   $ rm -f "$TMP/filtered_strains.txt"
diff --git a/tests/functional/filter/cram/filter-min-max-date-output.t b/tests/functional/filter/cram/filter-min-max-date-output.t
index 0b09331b9..3aad7f542 100644
--- a/tests/functional/filter/cram/filter-min-max-date-output.t
+++ b/tests/functional/filter/cram/filter-min-max-date-output.t
@@ -5,6 +5,9 @@ Setup
 
 Check output of min/max date filters.
 
+Pandas engine
+-------------
+
   $ ${AUGUR} filter \
   >  --metadata filter/data/metadata.tsv \
   >  --min-date 2015-01-01 \
@@ -14,3 +17,16 @@ Check output of min/max date filters.
   \t1 of these were dropped because they were earlier than 2015.0 or missing a date (esc)
   \t7 of these were dropped because they were later than 2016.09 or missing a date (esc)
   4 strains passed all filters
+
+SQLite engine
+-------------
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --metadata filter/data/metadata.tsv \
+  >  --min-date 2015-01-01 \
+  >  --max-date 2016-02-01 \
+  >  --output-metadata "$TMP/filtered_metadata.tsv"
+  8 strains were dropped during filtering
+  \t1 of these were dropped because they were earlier than 2015.0 or missing a date (esc)
+  \t7 of these were dropped because they were later than 2016.09 or missing a date (esc)
+  4 strains passed all filters
diff --git a/tests/functional/filter/cram/filter-mismatched-sequences-error.t b/tests/functional/filter/cram/filter-mismatched-sequences-error.t
index 14da5e054..e4ce30c99 100644
--- a/tests/functional/filter/cram/filter-mismatched-sequences-error.t
+++ b/tests/functional/filter/cram/filter-mismatched-sequences-error.t
@@ -3,6 +3,9 @@ Setup
   $ pushd "$TESTDIR" > /dev/null
   $ source _setup.sh
 
+Pandas engine
+-------------
+
 Try to filter with sequences that don't match any of the metadata.
 This should produce no results because the intersection of metadata and sequences is empty.
 
@@ -51,3 +54,55 @@ Since we expect metadata to be filtered by presence of strains in input sequence
   $ wc -l "$TMP/filtered_strains.txt"
   \s*0 .* (re)
   $ rm -f "$TMP/filtered_strains.txt"
+
+SQLite engine
+-------------
+
+Try to filter with sequences that don't match any of the metadata.
+This should produce no results because the intersection of metadata and sequences is empty.
+
+  $ echo -e ">something\nATCG" > "$TMP/dummy.fasta"
+  $ ${AUGUR} filter --engine sqlite \
+  >  --sequences "$TMP/dummy.fasta" \
+  >  --metadata filter/data/metadata.tsv \
+  >  --min-length 4 \
+  >  --max-date 2020-01-30 \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
+  ERROR: All samples have been dropped! Check filter rules and metadata file format.
+  [2]
+  $ wc -l "$TMP/filtered_strains.txt"
+  \s*0 .* (re)
+  $ rm -f "$TMP/filtered_strains.txt"
+
+Repeat with sequence and strain outputs. We should get the same results.
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --sequences "$TMP/dummy.fasta" \
+  >  --metadata filter/data/metadata.tsv \
+  >  --max-date 2020-01-30 \
+  >  --output-strains "$TMP/filtered_strains.txt" \
+  >  --output-sequences "$TMP/filtered.fasta" > /dev/null
+  Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
+  ERROR: All samples have been dropped! Check filter rules and metadata file format.
+  [2]
+  $ wc -l "$TMP/filtered_strains.txt"
+  \s*0 .* (re)
+  $ grep "^>" "$TMP/filtered.fasta" | wc -l
+  \s*0 (re)
+  $ rm -f "$TMP/filtered_strains.txt"
+  $ rm -f "$TMP/filtered.fasta"
+
+Repeat without any sequence-based filters.
+Since we expect metadata to be filtered by presence of strains in input sequences, this should produce no results because the intersection of metadata and sequences is empty.
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --sequences "$TMP/dummy.fasta" \
+  >  --metadata filter/data/metadata.tsv \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
+  ERROR: All samples have been dropped! Check filter rules and metadata file format.
+  [2]
+  $ wc -l "$TMP/filtered_strains.txt"
+  \s*0 .* (re)
+  $ rm -f "$TMP/filtered_strains.txt"
diff --git a/tests/functional/filter/cram/filter-no-outputs-error.t b/tests/functional/filter/cram/filter-no-outputs-error.t
index 7eedec3e1..52e8866f5 100644
--- a/tests/functional/filter/cram/filter-no-outputs-error.t
+++ b/tests/functional/filter/cram/filter-no-outputs-error.t
@@ -5,9 +5,22 @@ Setup
 
 Try to filter without any outputs.
 
+Pandas engine
+-------------
+
   $ ${AUGUR} filter \
   >  --sequence-index filter/data/sequence_index.tsv \
   >  --metadata filter/data/metadata.tsv \
   >  --min-length 10000 > /dev/null
   ERROR: You need to select at least one output.
   [2]
+
+SQLite engine
+-------------
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --sequence-index filter/data/sequence_index.tsv \
+  >  --metadata filter/data/metadata.tsv \
+  >  --min-length 10000 > /dev/null
+  ERROR: You need to select at least one output.
+  [2]
diff --git a/tests/functional/filter/cram/filter-output-directory-not-found-error.t b/tests/functional/filter/cram/filter-output-directory-not-found-error.t
index 4fc4131cb..55cdfb70a 100644
--- a/tests/functional/filter/cram/filter-output-directory-not-found-error.t
+++ b/tests/functional/filter/cram/filter-output-directory-not-found-error.t
@@ -5,6 +5,9 @@ Setup
 
 Try to output to a directory that does not exist.
 
+Pandas engine
+-------------
+
   $ ${AUGUR} filter \
   >  --metadata filter/data/metadata.tsv \
   >  --group-by year month \
@@ -12,3 +15,14 @@ Try to output to a directory that does not exist.
   >  --output-strains "directory-does-not-exist/filtered_strains.txt" > /dev/null
   ERROR: No such file or directory: 'directory-does-not-exist/filtered_strains.txt'
   [2]
+
+SQLite engine
+-------------
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --metadata filter/data/metadata.tsv \
+  >  --group-by year month \
+  >  --sequences-per-group 1 \
+  >  --output-strains "directory-does-not-exist/filtered_strains.txt" > /dev/null
+  ERROR: No such file or directory: 'directory-does-not-exist/filtered_strains.txt'
+  [2]
diff --git a/tests/functional/filter/cram/filter-output-strains-no-sequence-error.t b/tests/functional/filter/cram/filter-output-strains-no-sequence-error.t
index 2648c18b4..fd67b0ccc 100644
--- a/tests/functional/filter/cram/filter-output-strains-no-sequence-error.t
+++ b/tests/functional/filter/cram/filter-output-strains-no-sequence-error.t
@@ -6,6 +6,9 @@ Setup
 Try to filter with sequence outputs and no sequence inputs.
 This should fail.
 
+Pandas engine
+-------------
+
   $ ${AUGUR} filter \
   >  --sequence-index filter/data/sequence_index.tsv \
   >  --metadata filter/data/metadata.tsv \
@@ -13,3 +16,14 @@ This should fail.
   >  --output "$TMP/filtered.fasta" > /dev/null
   ERROR: You need to provide sequences to output sequences.
   [2]
+
+SQLite engine
+-------------
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --sequence-index filter/data/sequence_index.tsv \
+  >  --metadata filter/data/metadata.tsv \
+  >  --min-length 10000 \
+  >  --output "$TMP/filtered.fasta" > /dev/null
+  ERROR: You need to provide sequences to output sequences.
+  [2]
diff --git a/tests/functional/filter/cram/filter-query-example.t b/tests/functional/filter/cram/filter-query-example.t
index bbcfb8bf1..8883aaf14 100644
--- a/tests/functional/filter/cram/filter-query-example.t
+++ b/tests/functional/filter/cram/filter-query-example.t
@@ -3,6 +3,9 @@ Setup
   $ pushd "$TESTDIR" > /dev/null
   $ source _setup.sh
 
+Pandas engine
+-------------
+
 Filter into two separate sets and then select sequences from the union of those sets.
 First, select strains from Brazil (there should be 1).
 
@@ -66,3 +69,70 @@ Alternately, exclude the sequences from Brazil and Colombia (N=4) and records wi
   $ grep "^>" "$TMP/filtered.fasta" | wc -l
   \s*7 (re)
   $ rm -f "$TMP/filtered.fasta"
+
+SQLite engine
+-------------
+
+Filter into two separate sets and then select sequences from the union of those sets.
+First, select strains from Brazil (there should be 1).
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --metadata filter/data/metadata.tsv \
+  >  --query "country == 'Brazil'" \
+  >  --output-strains "$TMP/filtered_strains.brazil.txt" > /dev/null
+  $ wc -l "$TMP/filtered_strains.brazil.txt"
+  \s*1 .* (re)
+
+Then, select strains from Colombia (there should be 3).
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --metadata filter/data/metadata.tsv \
+  >  --query "country == 'Colombia'" \
+  >  --output-strains "$TMP/filtered_strains.colombia.txt" > /dev/null
+  $ wc -l "$TMP/filtered_strains.colombia.txt"
+  \s*3 .* (re)
+
+Finally, exclude all sequences except those from the two sets of strains (there should be 4).
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --sequences filter/data/sequences.fasta \
+  >  --sequence-index filter/data/sequence_index.tsv \
+  >  --metadata filter/data/metadata.tsv \
+  >  --exclude-all \
+  >  --include "$TMP/filtered_strains.brazil.txt" "$TMP/filtered_strains.colombia.txt" \
+  >  --output "$TMP/filtered.fasta" > /dev/null
+  $ grep "^>" "$TMP/filtered.fasta" | wc -l
+  \s*4 (re)
+  $ rm -f "$TMP/filtered.fasta"
+
+Repeat this filter without a sequence index.
+We should get the same outputs without building a sequence index on the fly, because the exclude-all flag tells us we only want to force-include strains and skip all other filters.
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --sequences filter/data/sequences.fasta \
+  >  --metadata filter/data/metadata.tsv \
+  >  --exclude-all \
+  >  --include "$TMP/filtered_strains.brazil.txt" "$TMP/filtered_strains.colombia.txt" \
+  >  --output "$TMP/filtered.fasta" \
+  >  --output-metadata "$TMP/filtered.tsv" > /dev/null
+  $ grep "^>" "$TMP/filtered.fasta" | wc -l
+  \s*4 (re)
+  $ rm -f "$TMP/filtered.fasta"
+
+Metadata should have the same number of records as the sequences plus a header.
+
+  $ wc -l "$TMP/filtered.tsv"
+  \s*5 .* (re)
+  $ rm -f "$TMP/filtered.tsv"
+
+Alternatively, exclude the sequences from Brazil and Colombia (N=4) and records without sequences (N=1) or metadata (N=1).
+
+  $ ${AUGUR} filter --engine sqlite \
+  >  --sequences filter/data/sequences.fasta \
+  >  --sequence-index filter/data/sequence_index.tsv \
+  >  --metadata filter/data/metadata.tsv \
+  >  --exclude "$TMP/filtered_strains.brazil.txt" "$TMP/filtered_strains.colombia.txt" \
+  >  --output "$TMP/filtered.fasta" > /dev/null
+  $ grep "^>" "$TMP/filtered.fasta" | wc -l
+  \s*7 (re)
+  $ rm -f "$TMP/filtered.fasta"