diff --git a/.github/workflows/latest.yaml b/.github/workflows/latest.yaml
index ff8cc5d..6c7caca 100644
--- a/.github/workflows/latest.yaml
+++ b/.github/workflows/latest.yaml
@@ -76,17 +76,17 @@ jobs:
- name: validate
run: |
- ${{ env.RUN_CMD }} run --dataset-dir dataset/sars-cov-2/latest --output-dir validate --populations "*" --threads 2
+ ${{ env.RUN_CMD }} run --dataset-dir dataset/sars-cov-2/latest --output-dir output/validate --populations "*" --threads 2
- if [[ $(grep "fail" validate/linelist.tsv) ]]; then exit 1; fi
+ if [[ $(grep "fail" output/validate/linelist.tsv) ]]; then exit 1; fi
- name: plot
run: |
- ${{ env.RUN_CMD }} plot --dataset-dir dataset/sars-cov-2/latest --output-dir validate --plot-dir validate/plots
+ ${{ env.RUN_CMD }} plot --dataset-dir dataset/sars-cov-2/latest --run-dir output/validate
- name: upload
uses: actions/upload-artifact@v3
with:
name: validate-${{ matrix.arch }}
- path: validate
+ path: output/validate
if-no-files-found: error
retention-days: 7
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 459836d..da40178 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -105,17 +105,18 @@ jobs:
- name: example 1
run: |
${{ env.RUN_CMD }} run --dataset-dir dataset/sars-cov-2/${{ env.DATASET_TAG }} --output-dir output/example1 --populations "AY.4.2*,BA.5.2,XBC.1.6*,XBB.1.5.1,XBL"
- ${{ env.RUN_CMD }} plot --dataset-dir dataset/sars-cov-2/${{ env.DATASET_TAG }} --output-dir output/example1
+ ${{ env.RUN_CMD }} plot --dataset-dir dataset/sars-cov-2/${{ env.DATASET_TAG }} --run-dir output/example1
- name: example 2
run: |
${{ env.RUN_CMD }} run --dataset-dir dataset/sars-cov-2/${{ env.DATASET_TAG }} --output-dir output/example2 --alignment data/example2.fasta
- ${{ env.RUN_CMD }} plot --dataset-dir dataset/sars-cov-2/${{ env.DATASET_TAG }} --output-dir output/example2
+ ${{ env.RUN_CMD }} plot --dataset-dir dataset/sars-cov-2/${{ env.DATASET_TAG }} --run-dir output/example2
- name: toy1
run: |
+ ${{ env.RUN_CMD }} dataset download --output-dir dataset/toy1 --name toy1 --tag custom
${{ env.RUN_CMD }} run --dataset-dir dataset/toy1 --output-dir output/toy1 --populations "*" --mask 0,0 --min-length 1
- ${{ env.RUN_CMD }} plot --dataset-dir dataset/toy1 --output-dir output/toy1
+ ${{ env.RUN_CMD }} plot --dataset-dir dataset/toy1 --run-dir output/toy1
- name: upload output
uses: actions/upload-artifact@v3
diff --git a/Cargo.lock b/Cargo.lock
index fd8bb43..b0726dd 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -568,12 +568,9 @@ dependencies = [
[[package]]
name = "fastrand"
-version = "1.9.0"
+version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be"
-dependencies = [
- "instant",
-]
+checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5"
[[package]]
name = "fdeflate"
@@ -920,17 +917,6 @@ dependencies = [
"cfg-if",
]
-[[package]]
-name = "io-lifetimes"
-version = "1.0.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220"
-dependencies = [
- "hermit-abi 0.3.1",
- "libc",
- "windows-sys 0.48.0",
-]
-
[[package]]
name = "ipnet"
version = "2.7.2"
@@ -944,7 +930,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
dependencies = [
"hermit-abi 0.3.1",
- "rustix 0.38.25",
+ "rustix",
"windows-sys 0.48.0",
]
@@ -1017,12 +1003,6 @@ version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4"
-[[package]]
-name = "linux-raw-sys"
-version = "0.3.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ece97ea872ece730aed82664c424eb4c8291e1ff2480247ccf7409044bc6479f"
-
[[package]]
name = "linux-raw-sys"
version = "0.4.11"
@@ -1262,9 +1242,9 @@ checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
[[package]]
name = "openssl"
-version = "0.10.59"
+version = "0.10.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a257ad03cd8fb16ad4172fedf8094451e1af1c4b70097636ef2eac9a5f0cc33"
+checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800"
dependencies = [
"bitflags 2.4.1",
"cfg-if",
@@ -1303,9 +1283,9 @@ dependencies = [
[[package]]
name = "openssl-sys"
-version = "0.9.95"
+version = "0.9.96"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40a4130519a360279579c2053038317e40eff64d13fd3f004f9e1b72b8a6aaf9"
+checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f"
dependencies = [
"cc",
"libc",
@@ -1530,7 +1510,7 @@ dependencies = [
[[package]]
name = "rebar"
-version = "0.1.0"
+version = "0.2.0"
dependencies = [
"bio",
"chrono",
@@ -1549,7 +1529,7 @@ dependencies = [
"raqote",
"rayon",
"reqwest",
- "rustix 0.38.25",
+ "rustix",
"rusttype",
"semver 1.0.19",
"serde",
@@ -1562,9 +1542,9 @@ dependencies = [
[[package]]
name = "redox_syscall"
-version = "0.3.5"
+version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29"
+checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
dependencies = [
"bitflags 1.3.2",
]
@@ -1650,20 +1630,6 @@ dependencies = [
"semver 0.1.20",
]
-[[package]]
-name = "rustix"
-version = "0.37.19"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d"
-dependencies = [
- "bitflags 1.3.2",
- "errno",
- "io-lifetimes",
- "libc",
- "linux-raw-sys 0.3.7",
- "windows-sys 0.48.0",
-]
-
[[package]]
name = "rustix"
version = "0.38.25"
@@ -1673,7 +1639,7 @@ dependencies = [
"bitflags 2.4.1",
"errno",
"libc",
- "linux-raw-sys 0.4.11",
+ "linux-raw-sys",
"windows-sys 0.48.0",
]
@@ -1929,15 +1895,15 @@ dependencies = [
[[package]]
name = "tempfile"
-version = "3.5.0"
+version = "3.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998"
+checksum = "7ef1adac450ad7f4b3c28589471ade84f25f731a7a0fe30d71dfa9f60fd808e5"
dependencies = [
"cfg-if",
"fastrand",
"redox_syscall",
- "rustix 0.37.19",
- "windows-sys 0.45.0",
+ "rustix",
+ "windows-sys 0.48.0",
]
[[package]]
diff --git a/Cargo.toml b/Cargo.toml
index b2c50fc..de42cdb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "rebar"
-version = "0.1.0"
+version = "0.2.0"
edition = "2021"
[dependencies]
@@ -15,7 +15,7 @@ indicatif = { version = "0.17.5", default-features = false, features = ["
indoc = { version = "2.0.4", default-features = false }
itertools = { version = "0.12.0", default-features = false, features = ["use_std"] }
log = { version = "0.4.17", default-features = false }
-openssl = { version = "0.10.59", default-features = false, features = ["vendored"]}
+openssl = { version = "0.10.60", default-features = false, features = ["vendored"]}
petgraph = { version = "0.6.3", default-features = false, features = ["serde-1"] }
raqote = { version = "0.8.2", default-features = false, features = ["png"]}
rand = { version = "0.8.5", default-features = false }
@@ -27,7 +27,7 @@ semver = { version = "1.0.19", default-features = false, features = ["
serde = { version = "1.0.163", default-features = false, features = ["derive"] }
serde_json = { version = "1.0.96", default-features = false }
strum = { version = "0.25", default-features = false, features = ["derive"] }
-tempfile = { version = "3.5.0", default-features = false }
+tempfile = { version = "3.8.1", default-features = false }
tokio = { version = "1.28.1", default-features = false, features = ["macros", "rt-multi-thread"] }
zstd = { version = "0.13.0", default-features = false }
diff --git a/README.md b/README.md
index 585f18d..a6c420d 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,13 @@ wget -O rebar https://github.com/phac-nml/rebar/releases/download/v0.1.3/rebar-x
## Usage
-1. Download a dataset, version-controlled to a specific date.
+1. Preview pre-built datasets.
+
+ ```bash
+ rebar dataset list
+ ```
+
+1. Download a pre-built dataset, version-controlled to a specific date (try any date!).
```bash
rebar dataset download --name sars-cov-2 --tag 2023-11-17 --output-dir dataset/sars-cov-2/2023-11-17
@@ -135,7 +141,7 @@ Special thanks go to the following people, who are instrumental to the design an
-Thanks go to the following people, who participated in the development of [ncov-recombinant](https://github.com/ktmeaton/ncov-recombinant), which `rebar` is based on:
+Thanks go to the following people, who participated in the development of `rebar` and [ncov-recombinant](https://github.com/ktmeaton/ncov-recombinant):
diff --git a/dataset/toy1/phylogeny.json b/dataset/toy1/phylogeny.json
deleted file mode 100644
index 85ec095..0000000
--- a/dataset/toy1/phylogeny.json
+++ /dev/null
@@ -1,50 +0,0 @@
-{
- "graph": {
- "nodes": [
- "root",
- "A",
- "B",
- "C"
- ],
- "node_holes": [],
- "edge_property": "directed",
- "edges": [
- [
- 0,
- 1,
- 1
- ],
- [
- 0,
- 2,
- 1
- ],
- [
- 1,
- 3,
- 1
- ],
- [
- 2,
- 3,
- 1
- ]
- ]
- },
- "order": [
- "A",
- "B",
- "C"
- ],
- "recombinants": [
- "C"
- ],
- "recombinants_all": [
- "C"
- ],
- "non_recombinants_all": [
- "root",
- "A",
- "B"
- ]
-}
diff --git a/dataset/toy1/populations.fasta b/dataset/toy1/populations.fasta
deleted file mode 100644
index 3104b25..0000000
--- a/dataset/toy1/populations.fasta
+++ /dev/null
@@ -1,6 +0,0 @@
->A
-CCCCCCCCCC
->B
-TTTTTTTTTT
->C
-CCCCCCTTTT
diff --git a/dataset/toy1/reference.fasta b/dataset/toy1/reference.fasta
deleted file mode 100644
index 8342883..0000000
--- a/dataset/toy1/reference.fasta
+++ /dev/null
@@ -1,2 +0,0 @@
->Reference
-AAAAAAAAAA
diff --git a/docs/dataset.md b/docs/dataset.md
new file mode 100644
index 0000000..7390dd5
--- /dev/null
+++ b/docs/dataset.md
@@ -0,0 +1,51 @@
+# Dataset
+
+A `rebar` dataset consists of two mandatory parts:
+
+1. `reference.fasta`: The reference genome.
+
+ ```text
+ >Reference
+ AAAAAAAAAAAAAAAAAAAA
+ ```
+
+1. `populations.fasta`: Known populations (ex. clades, lineages) aligned to the reference.
+
+ ```text
+ >A
+ CCCCCCAACCCCCCCCCCCC
+ >B
+ TTTTTTTTTTTTTTTTTTAA
+ >C
+ AAGGGGGGGGGGGGGGGGGG
+ >D
+ CCCCCCAACCCTTTTTTTAA
+ >E
+ AAGCCCAACCCTTTTTTTAA
+ ```
+
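+   Relative to this reference, population `A` carries the substitutions `A1C` through `A6C` and `A9C` through `A20C` (positions 7 and 8 match the reference); these per-population substitutions are what `rebar` compares a query sequence against.
+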
+The following are optional components:
+
+1. `phylogeny.json`: A phylogenetic graph that provides prior information about the evolutionary history. This is particularly useful if populations in `populations.fasta` are internal nodes or known recombinants. Each entry in `edges` below is `[parent_index, child_index, weight]`, where the indices refer to positions in `nodes`; `C` has incoming edges from both `A` and `B`, marking it as a recombinant.
+
+ ```json
+ {
+ "graph": {
+      "nodes": [ "root", "A", "B", "C"],
+      "node_holes": [],
+ "edge_property": "directed",
+ "edges": [
+ [ 0, 1, 1],
+ [ 0, 2, 1],
+ [ 1, 3, 1],
+ [ 2, 3, 1]
+ ]
+ }
+ }
+ ```
+
+1. `annotations.tsv`: A table of genome annotations to add to the plot.
+
+ |gene |abbreviation|start|end|
+ |:----|:-----------|:----|:--|
+ |Gene1|g1 |1 |3 |
+ |Gene2|g2 |7 |10 |
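+
+For a hands-on example, the bundled `toy1` dataset contains files like the ones above, and can be downloaded locally (this is the same command used in this project's CI, see `.github/workflows/test.yaml`):
+
+```bash
+rebar dataset download --name toy1 --tag custom --output-dir dataset/toy1
+```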
diff --git a/docs/examples.md b/docs/examples.md
index 1d6d3c0..1f61f5a 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -10,13 +10,13 @@ The names of SARS-CoV-2 lineages as input.
rebar run \
--dataset-dir dataset/sars-cov-2/2023-11-17 \
--populations "AY.4.2*,BA.5.2,XBC.1.6*,XBB.1.5.1,XBL" \
- --output-dir output/example1
+ --output-dir output/example/population
```
1. Plot breakpoints and parental regions.
```bash
- rebar plot --dataset-dir dataset/sars-cov-2/2023-11-17 --output-dir output/example1
+ rebar plot --dataset-dir dataset/sars-cov-2/2023-11-17 --output-dir output/example/population
```
-The populations (`--populations`) can include any sequence name found in the dataset's `populations.fasta`. For `sars-cov-2`, sequence names are the designated lineages. The wildcard character ("\*") will include the lineage and all its descendants. **NOTE**: If using "\*", make sure to use quotes (ex. `--lineages "XBC*,XBB.1.16*"`)!
+The populations (`--populations`) can include any sequence name found in the dataset's `populations.fasta`. For `sars-cov-2`, sequence names are the designated lineages. The wildcard character ("\*") will include the lineage and all its descendants. **NOTE**: If using "\*", make sure to use quotes (ex. `--populations "XBC*,XBB.1.16*"`)!
@@ -37,11 +37,51 @@ An alignment of SARS-CoV-2 genomes as input.
rebar run \
- --dataset dataset/sars-cov-2/2023-11-17 \
+ --dataset-dir dataset/sars-cov-2/2023-11-17 \
--alignment example2.fasta \
- --output-dir output/example2
+ --output-dir output/example/alignment
```
Please note that the `--alignment` should be aligned to the same reference as in the dataset `reference.fasta` (we strongly recommend [nextclade](https://clades.nextstrain.org/)).
+## Example 3 | Debug
+
+You can see the inner workings of the `rebar` algorithm by using `--verbosity debug`. Let's test this on the SARS-CoV-2 recombinant `XCC`, which has the known parents `XBB.1.9.1` and `CH.1.1.1`.
+
+```bash
+rebar run \
+ --dataset-dir dataset/sars-cov-2/2023-11-17 \
+ --populations "XCC" \
+ --output-dir output/example/debug \
+ --verbosity debug
+```
+
+The debug output will report detailed information on the dataset search for the primary parent (best match/consensus population). In addition, it will search for secondary parents (recombination) by testing four different recombination hypotheses:
+
+1. Non-Recombinant
+1. Designated Recombinant (using known parents from the dataset)
+1. Recursive Recombinant (allowing parents to be recombinants themselves)
+1. Non-Recursive Recombinant (not allowing parents to be recombinants)
+
+The best match/primary parent for `XCC` is... itself, excellent! `XCC` is a known recombinant, so that rules out **Hypothesis \#1**.
+
+Since `XCC` is a known recombinant, `rebar` will evaluate **Hypothesis \#2**: the primary parent search is redone, focusing exclusively on the designated parents (`XBB.1.9.1` and `CH.1.1.1`), followed by a search among them for a secondary parent.
+
+For **Hypotheses \#3 and \#4**, `rebar` will instead search for a secondary parent among all possible populations, either allowing (\#3) or not allowing (\#4) those parents to be recombinants themselves. Note that since we used a dataset population as our query, `XCC` is an exact match to itself, which leaves few mutational conflicts to be explained by a secondary parent.
+
+In the end, `rebar` only finds evidence to support Hypotheses \#2 (designated) and \#3 (recursive):
+
+```text
+Hypotheses: DesignatedRecombinant: score=66, conflict=3, RecursiveRecombinant: score=66, conflict=3
+```
+
-## Example 3 | Knockout
+## Example 4 | Knockout
You can perform a 'knockout' experiment to remove populations from the dataset. For example, we might be interested in what the SARS-CoV-2 recombinant `XBB` would have been classified as _before_ it became a designated lineage.
@@ -53,7 +93,7 @@ You can perform a 'knockout' experiment to remove populations from the dataset.
--dataset-dir dataset/sars-cov-2/2023-11-17 \
--populations "XBB" \
--knockout "XBB" \
- --output-dir output/example3
+ --output-dir output/example/knockout
```
-1. Examine the linelist (`output/example3/linelist.tsv`).
+1. Examine the linelist (`output/example/knockout/linelist.tsv`).
@@ -67,6 +107,22 @@ You can perform a 'knockout' experiment to remove populations from the dataset.
-- In addition, it has one "private" substitutions that is not found in either parent: `A19326G`.
-- This can be used to contribute evidence for a new lineage proposal in the [pango-designation](https://github.com/cov-lineages/pango-designation/issues) respository.
+- In addition, it has one "private" substitution that is not found in either parent: `A19326G`.
+- This can be used to contribute evidence for a new lineage proposal in the [pango-designation](https://github.com/cov-lineages/pango-designation/issues) repository.
+## Example 5 | Parents
+
+By default, `rebar` will consider all populations in the dataset as possible parents. If you would like to see the evidence for a specific hypothesis, you can restrict the parent search with `--parents`.
+
+1. Detect recombination, restricting the candidate parents to `B.1.617.2*` and `BA.1*`.
+
+   ```bash
+   rebar run \
+     --dataset-dir dataset/sars-cov-2/2023-11-17 \
+     --populations "XD" \
+     --parents "B.1.617.2*,BA.1*" \
+     --output-dir output/example/parents \
+     --verbosity debug
+   ```
+
+   The debug log will then report the hypothesis testing for `XD` against only these candidate parents.
+
-## Example 4 | Validate
+## Example 6 | Validate
Run `rebar` on all populations in the dataset, and validate against the expected results.
diff --git a/docs/run.md b/docs/run.md
new file mode 100644
index 0000000..29de756
--- /dev/null
+++ b/docs/run.md
@@ -0,0 +1,7 @@
+# Run
+
+> **Tip**: The inner workings of the `rebar` algorithm can be exposed by including the `--verbosity debug` flag!
+
+`rebar` begins by comparing a query sequence to the dataset populations in order to find its best match. The best match is simply defined as the population with the greatest number of shared mutations and the fewest conflicting bases. Sites with missing data ("N") and deletions ("-") are ignored in this calculation. The best match represents the primary parent of the query sequence.
+
+`rebar` then proceeds to search for secondary parents, testing whether the query's remaining mutational conflicts are better explained by recombination between the primary parent and one or more additional populations (the four recombination hypotheses walked through in `docs/examples.md`).
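+
+As a minimal sketch of a typical session (using the dataset tag and a population from `docs/examples.md`):
+
+```bash
+rebar run --dataset-dir dataset/sars-cov-2/2023-11-17 --populations "XBL" --output-dir output/run
+rebar plot --dataset-dir dataset/sars-cov-2/2023-11-17 --run-dir output/run
+```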
diff --git a/src/cli/plot.rs b/src/cli/plot.rs
index 17bed0d..bd62add 100644
--- a/src/cli/plot.rs
+++ b/src/cli/plot.rs
@@ -12,8 +12,8 @@ pub struct Args {
/// Output directory from rebar run.
///
/// Will plot all TSV files under barcodes/
- #[clap(short = 'o', long, required = true)]
- pub output_dir: PathBuf,
+ #[clap(short = 'r', long, required = true)]
+ pub run_dir: PathBuf,
/// A single barcodes TSV file to plot.
#[clap(short = 'b', long)]
@@ -21,11 +21,24 @@ pub struct Args {
/// Output directory for plots.
///
- /// Otherwise will default to 'plots' under the --output-dir
- #[clap(short = 'p', long)]
- pub plot_dir: Option<PathBuf>,
+ /// Otherwise will default to 'plots/' under the --run-dir
+ #[clap(short = 'o', long)]
+ pub output_dir: Option<PathBuf>,
+}
+
+impl Default for Args {
+ fn default() -> Self {
+ Self::new()
+ }
+}
- /// Directory to download fonts to.
- #[clap(short = 'f', long, default_value = PathBuf::from(".cache/fonts").into_os_string())]
- pub font_cache: PathBuf,
+impl Args {
+ pub fn new() -> Self {
+ Args {
+ dataset_dir: PathBuf::new(),
+ run_dir: PathBuf::new(),
+ barcodes_file: None,
+ output_dir: None,
+ }
+ }
}
diff --git a/src/dataset/attributes.rs b/src/dataset/attributes.rs
index c194c1c..e7ddba0 100644
--- a/src/dataset/attributes.rs
+++ b/src/dataset/attributes.rs
@@ -24,6 +24,9 @@ pub enum Name {
#[serde(rename = "sars-cov-2")]
#[strum(props(implemented = "true"))]
SarsCov2,
+ #[serde(rename = "toy1")]
+ #[strum(props(implemented = "true"))]
+ Toy1,
#[serde(rename = "rsv-a")]
#[strum(props(implemented = "false"))]
RsvA,
@@ -45,6 +48,7 @@ impl Name {
compatibility.dataset.min_date =
Some(NaiveDate::parse_from_str("2023-02-09", "%Y-%m-%d")?);
}
+ Name::Toy1 => compatibility.cli.version = Some(">=0.2.0".to_string()),
_ => compatibility.cli.version = Some(">=1.0.0".to_string()),
}
Ok(compatibility)
@@ -55,6 +59,7 @@ impl fmt::Display for Name {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let name = match self {
Name::SarsCov2 => String::from("sars-cov-2"),
+ Name::Toy1 => String::from("toy1"),
Name::RsvA => String::from("rsv-a"),
Name::RsvB => String::from("rsv-b"),
Name::Custom => String::from("custom"),
@@ -70,6 +75,7 @@ impl FromStr for Name {
fn from_str(name: &str) -> Result<Self, Self::Err> {
let name = match name {
"sars-cov-2" => Name::SarsCov2,
+ "toy1" => Name::Toy1,
"rsv-a" => Name::RsvA,
"rsv-b" => Name::RsvB,
"custom" => Name::Custom,
diff --git a/src/dataset/download.rs b/src/dataset/download.rs
index 1ca24f6..b19cbc9 100644
--- a/src/dataset/download.rs
+++ b/src/dataset/download.rs
@@ -1,6 +1,7 @@
use crate::cli;
use crate::dataset;
use crate::dataset::attributes::{check_compatibility, Name, Summary};
+use crate::dataset::{sarscov2, toy1};
use crate::{utils, utils::remote_file::RemoteFile};
//use crate::sequence::Substitution;
use color_eyre::eyre::{eyre, Report, Result};
@@ -70,8 +71,9 @@ pub async fn dataset(args: &mut cli::dataset::download::Args) -> Result<(), Repo
} else {
match args.name {
Name::SarsCov2 => {
- dataset::sarscov2::download::reference(&args.tag, &output_path).await?
+ sarscov2::download::reference(&args.tag, &output_path).await?
}
+ Name::Toy1 => toy1::download::reference(&args.tag, &output_path)?,
_ => todo!(),
}
};
@@ -87,8 +89,9 @@ pub async fn dataset(args: &mut cli::dataset::download::Args) -> Result<(), Repo
} else {
match args.name {
Name::SarsCov2 => {
- dataset::sarscov2::download::populations(&args.tag, &output_path).await?
+ sarscov2::download::populations(&args.tag, &output_path).await?
}
+ Name::Toy1 => toy1::download::populations(&args.tag, &output_path)?,
_ => todo!(),
}
};
@@ -100,7 +103,8 @@ pub async fn dataset(args: &mut cli::dataset::download::Args) -> Result<(), Repo
info!("Creating annotations: {output_path:?}");
let annotations = match args.name {
- Name::SarsCov2 => dataset::sarscov2::annotations::build()?,
+ Name::SarsCov2 => sarscov2::annotations::build()?,
+ Name::Toy1 => toy1::annotations::build()?,
_ => todo!(),
};
annotations.write(&output_path)?;
@@ -113,8 +117,9 @@ pub async fn dataset(args: &mut cli::dataset::download::Args) -> Result<(), Repo
let phylogeny = match args.name {
Name::SarsCov2 => {
- dataset::sarscov2::phylogeny::build(&mut summary, &args.output_dir).await?
+ sarscov2::phylogeny::build(&mut summary, &args.output_dir).await?
}
+ Name::Toy1 => toy1::phylogeny::build()?,
_ => todo!(),
};
phylogeny.write(&output_path)?;
@@ -147,6 +152,7 @@ pub async fn dataset(args: &mut cli::dataset::download::Args) -> Result<(), Repo
let mut edge_cases = match args.name {
Name::SarsCov2 => dataset::sarscov2::edge_cases::default()?,
+ Name::Toy1 => dataset::toy1::edge_cases::default()?,
_ => todo!(),
};
let manual_populations =
diff --git a/src/dataset/list.rs b/src/dataset/list.rs
index b35d6ba..b800cfe 100644
--- a/src/dataset/list.rs
+++ b/src/dataset/list.rs
@@ -6,7 +6,7 @@ use itertools::Itertools;
use strum::{EnumProperty, IntoEnumIterator};
/// List datasets
-pub async fn datasets(args: &cli::dataset::list::Args) -> Result<(), Report> {
+pub fn datasets(args: &cli::dataset::list::Args) -> Result<(), Report> {
// table of name, tag, cli_version
let mut table = Table::new();
table.headers = vec![
@@ -39,7 +39,7 @@ pub async fn datasets(args: &cli::dataset::list::Args) -> Result<(), Report> {
let min_date = if let Some(min_date) = compatibility.dataset.min_date {
min_date.format("%Y-%m-%d").to_string()
} else {
- String::new()
+ "latest".to_string()
};
let max_date = if let Some(max_date) = compatibility.dataset.max_date {
max_date.format("%Y-%m-%d").to_string()
diff --git a/src/dataset/load.rs b/src/dataset/load.rs
index 5beab34..c5f9f3e 100644
--- a/src/dataset/load.rs
+++ b/src/dataset/load.rs
@@ -90,7 +90,7 @@ pub fn parse_populations(
// read in populations from fasta
let populations_reader = fasta::Reader::from_file(populations_path)
.map_err(|e| eyre!(e))
- .wrap_err("Failed to read file: {populations_path:?}")?;
+ .wrap_err(format!("Failed to read file: {populations_path:?}"))?;
// read in reference from fasta
let reference = read_reference(reference_path, mask)?;
diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs
index 2e4e43a..eae93c0 100644
--- a/src/dataset/mod.rs
+++ b/src/dataset/mod.rs
@@ -214,7 +214,9 @@ impl Dataset {
let pop_seq = &self.populations[pop];
let summary =
parsimony::Summary::from_sequence(sequence, pop_seq, coordinates)
- .unwrap();
+ .unwrap_or_else(|_| {
+ panic!("Failed to create summary from sequence {}", &sequence.id)
+ });
result.support.insert(pop.to_owned(), summary.support);
result.conflict_ref.insert(pop.to_owned(), summary.conflict_ref);
result.conflict_alt.insert(pop.to_owned(), summary.conflict_alt);
@@ -229,7 +231,8 @@ impl Dataset {
// which population(s) has the highest score?
// reminder: it can be negative when extreme recombinant genomic size
- let max_score = result.score.values().max().unwrap();
+ let max_score =
+ result.score.values().max().expect("Failed to get max score of result.");
let max_score_populations = result
.score
@@ -270,32 +273,38 @@ impl Dataset {
result.consensus_population = consensus_population.clone();
// if the common_ancestor was not in the populations list, add it
- let consensus_sequence =
- if !result.top_populations.contains(&consensus_population) {
- let pop = &consensus_population;
-
- // // Option #1. Actual sequence of the internal MRCA node?
- // let pop_seq = &self.populations[pop];
- // let summary = parsimony::Summary::from_sequence(sequence, pop_seq, coordinates)?;
-
- // Option #2. Consensus sequence of top populations?
- let top_populations =
- result.top_populations.iter().map(|s| s.as_ref()).collect_vec();
- debug!("Creating {pop} consensus genome from top populations.");
- let pop_seq = self.create_consensus(pop, &top_populations)?;
- let summary =
- parsimony::Summary::from_sequence(sequence, &pop_seq, coordinates)?;
-
- // Add consensus summary to search result
- result.support.insert(pop.to_owned(), summary.support);
- result.conflict_ref.insert(pop.to_owned(), summary.conflict_ref);
- result.conflict_alt.insert(pop.to_owned(), summary.conflict_alt);
- result.score.insert(pop.to_owned(), summary.score);
-
- pop_seq
- } else {
- self.populations[&consensus_population].clone()
- };
+ let consensus_sequence = if !result
+ .top_populations
+ .contains(&consensus_population)
+ {
+ let pop = &consensus_population;
+
+ // // Option #1. Actual sequence of the internal MRCA node?
+ // let pop_seq = &self.populations[pop];
+ // let summary = parsimony::Summary::from_sequence(sequence, pop_seq, coordinates)?;
+
+ // Option #2. Consensus sequence of top populations?
+ let top_populations =
+ result.top_populations.iter().map(|s| s.as_ref()).collect_vec();
+ debug!("Creating {pop} consensus genome from top populations.");
+ let pop_seq = self.create_consensus(pop, &top_populations)?;
+ let summary =
+ parsimony::Summary::from_sequence(sequence, &pop_seq, coordinates)?;
+
+ // Add consensus summary to search result
+ result.support.insert(pop.to_owned(), summary.support);
+ result.conflict_ref.insert(pop.to_owned(), summary.conflict_ref);
+ result.conflict_alt.insert(pop.to_owned(), summary.conflict_alt);
+ result.score.insert(pop.to_owned(), summary.score);
+
+ pop_seq
+ } else {
+ self
+ .populations
+ .get(&consensus_population)
+ .cloned()
+ .unwrap_or_else(|| panic!("Consensus population {consensus_population} is not in the dataset populations."))
+ };
// Filter out non-top populations
// helps cut down on verbosity in debug log and data stored
@@ -335,12 +344,19 @@ impl Dataset {
.collect_vec();
// private subs (conflict_alt and conflict_ref reversed)
- result.private = result.conflict_alt[&consensus_population].clone();
- result.conflict_ref[&consensus_population].iter().for_each(|sub| {
- let mut sub = *sub;
- std::mem::swap(&mut sub.alt, &mut sub.reference);
- result.private.push(sub);
- });
+ result.private =
+ result.conflict_alt.get(&consensus_population).cloned().unwrap_or_default();
+ result
+ .conflict_ref
+ .get(&consensus_population)
+ .cloned()
+ .unwrap_or_default()
+ .iter()
+ .for_each(|sub| {
+ let mut sub = *sub;
+ std::mem::swap(&mut sub.alt, &mut sub.reference);
+ result.private.push(sub);
+ });
result.private.sort();
debug!("Search Result:\n{}", result.pretty_print());
@@ -395,8 +411,8 @@ impl SearchResult {
let consensus_score: (String, isize) = score_order
.iter()
.find(|(pop, _score)| *pop == self.consensus_population)
- .unwrap()
- .clone();
+ .cloned()
+ .expect("Failed to order consensus populations by score.");
score_order.retain(|(pop, _score)| *pop != self.consensus_population);
score_order.insert(0, consensus_score);
diff --git a/src/dataset/sarscov2/edge_cases.rs b/src/dataset/sarscov2/edge_cases.rs
index 705264c..6b35452 100644
--- a/src/dataset/sarscov2/edge_cases.rs
+++ b/src/dataset/sarscov2/edge_cases.rs
@@ -1,29 +1,11 @@
use crate::cli::run;
-//use crate::phylogeny::Phylogeny;
use color_eyre::eyre::{Report, Result};
-//use itertools::Itertools;
use log::debug;
/// Create default SARS-CoV-2 recombinant edge cases.
pub fn default() -> Result<Vec<run::Args>, Report> {
let mut edge_cases: Vec<run::Args> = Vec::new();
- // // --------------------------------------------------------------------
- // // Designated Recombinants
-
- // let manual = vec!["XCF", "XCG"].into_iter().map(String::from).collect_vec();
-
- // for recombinant in &phylogeny.recombinants {
- // if manual.contains(recombinant) {
- // continue
- // }
- // debug!("Creating auto edge case: {recombinant}");
- // let mut edge_case = run::Args::default();
- // edge_case.population = Some(recombinant.to_string());
- // edge_case.parents = Some(phylogeny.get_parents(recombinant)?);
- // edge_cases.push(edge_case);
- // }
-
// --------------------------------------------------------------------
// Manual
diff --git a/src/dataset/sarscov2/phylogeny.rs b/src/dataset/sarscov2/phylogeny.rs
index e19262b..817adba 100644
--- a/src/dataset/sarscov2/phylogeny.rs
+++ b/src/dataset/sarscov2/phylogeny.rs
@@ -151,8 +151,8 @@ pub async fn build(
// Parent Child Relationships
// ------------------------------------------------------------------------
- let mut graph_data: BTreeMap<String, Vec<String>> = BTreeMap::new();
- let mut graph_order: Vec<String> = Vec::new();
+ let mut graph_order = Vec::new();
+ let mut graph_data = BTreeMap::new();
for row in lineage_table.rows {
let lineage = row[lineage_col_i].to_string();
@@ -162,11 +162,6 @@ pub async fn build(
continue;
}
- // warn if a lineage has notes but no sequence.
- // this might be because there are insufficient (<=3) sequences
- // available in open data repositories (ex. Genbank)
- // ex. XCU on 2023-11-16
-
let parents = get_lineage_parents(&lineage, &alias_key)?;
graph_order.push(lineage.clone());
graph_data.insert(lineage, parents);
@@ -177,7 +172,6 @@ pub async fn build(
// ------------------------------------------------------------------------
let mut phylogeny = Phylogeny::new();
- phylogeny.order = graph_order;
// Add root node
let name = "root".to_string();
@@ -186,9 +180,18 @@ pub async fn build(
// todo!() Do this twice? in case lineages are accidentally out of order?
// Add descendants
- for name in &phylogeny.order {
+ for name in graph_order {
let id = phylogeny.graph.add_node(name.clone());
- let parents = &graph_data[&name.clone()];
+ if !graph_data.contains_key(&name) {
+ return Err(
+ eyre!("Parents of {name} are unknown in the phylogeny graph.")
+ .suggestion(
+ "Could the lineage_notes be out of order or misformatted?",
+ )
+ .suggestion("Parents are required to appear before children."),
+ );
+ }
+ let parents = graph_data.get(&name).unwrap();
debug!("Population: {name}; Parents: {parents:?}");
@@ -280,7 +283,7 @@ pub fn compress_lineage(
let compress_subset = compress_parts[0..i].join(".");
if alias_key_rev.contains_key(&compress_subset) {
- compress = alias_key_rev[&compress_subset].clone();
+ compress = alias_key_rev.get(&compress_subset).unwrap().clone();
// Get the suffix that was chopped off in subset
let compress_suffix = &compress_parts[i..];
@@ -326,15 +329,13 @@ pub fn decompress_lineage(
let lineage_suffix = lineage_parts[1..lineage_level].join(".");
// Decompressing logic
- if alias_key.contains_key(&lineage_prefix) {
- let lineage_paths = &alias_key[&lineage_prefix];
- // Not multiple recombinant parents
- if lineage_paths.len() == 1 {
- decompress = lineage_paths[0].clone();
- // Add back our suffix numbers
- if lineage_level > 1 {
- decompress = format!("{decompress}.{lineage_suffix}");
- }
+ let lineage_paths = alias_key.get(&lineage_prefix).cloned().unwrap_or_default();
+ // Not multiple recombinant parents
+ if lineage_paths.len() == 1 {
+ decompress = lineage_paths[0].clone();
+ // Add back our suffix numbers
+ if lineage_level > 1 {
+ decompress = format!("{decompress}.{lineage_suffix}");
}
}
diff --git a/src/dataset/toy1/annotations.rs b/src/dataset/toy1/annotations.rs
new file mode 100644
index 0000000..91c79f8
--- /dev/null
+++ b/src/dataset/toy1/annotations.rs
@@ -0,0 +1,23 @@
+use crate::utils::table::Table;
+use color_eyre::eyre::{Report, Result};
+use itertools::Itertools;
+
+/// Create Toy1 genome annotations.
+pub fn build() -> Result<Table, Report> {
+ let mut table = Table::new();
+
+ let headers = vec!["gene", "abbreviation", "start", "end"];
+ let rows = vec![
+ vec!["Gene1", "g1", "1", "3"],
+ vec!["Gene2", "g2", "12", "20"],
+ ];
+
+ // Convert values to String
+ table.headers = headers.into_iter().map(String::from).collect_vec();
+ table.rows = rows
+ .into_iter()
+ .map(|row| row.into_iter().map(String::from).collect_vec())
+ .collect_vec();
+
+ Ok(table)
+}
diff --git a/src/dataset/toy1/download.rs b/src/dataset/toy1/download.rs
new file mode 100644
index 0000000..dafe129
--- /dev/null
+++ b/src/dataset/toy1/download.rs
@@ -0,0 +1,62 @@
+use crate::dataset::attributes::Tag;
+use crate::utils::remote_file::RemoteFile;
+use chrono::Local;
+use color_eyre::eyre::{Report, Result, WrapErr};
+use indoc::formatdoc;
+use std::fs::File;
+use std::io::Write;
+use std::path::Path;
+
+/// Create and write Toy1 reference sequence.
+pub fn reference(_tag: &Tag, path: &Path) -> Result<RemoteFile, Report> {
+ let sequences = formatdoc!(
+ "
+ >Reference
+ AAAAAAAAAAAAAAAAAAAA
+ "
+ );
+
+ let mut file = File::create(path)
+ .wrap_err_with(|| format!("Unable to create file: {path:?}"))?;
+ file.write_all(sequences.as_bytes())
+ .wrap_err_with(|| format!("Unable to write file: {path:?}"))?;
+
+ let remote_file = RemoteFile {
+ local_path: path.to_owned(),
+ date_created: Local::now().into(),
+ ..Default::default()
+ };
+
+ Ok(remote_file)
+}
+
+/// Create and write Toy1 populations sequence.
+pub fn populations(_tag: &Tag, path: &Path) -> Result<RemoteFile, Report> {
+ let sequences = formatdoc!(
+ "
+ >A
+ CCCCCCAACCCCCCCCCCCC
+ >B
+ TTTTTTTTTTTTTTTTTTAA
+ >C
+ AAGGGGGGGGGGGGGGGGGG
+ >D
+ CCCCCCAACCCTTTTTTTAA
+ >E
+ AAGCCCAACCCTTTTTTTAA
+ "
+ );
+
+ let mut file = File::create(path)
+ .wrap_err_with(|| format!("Unable to create file: {path:?}"))?;
+ file.write_all(sequences.as_bytes())
+ .wrap_err_with(|| format!("Unable to write file: {path:?}"))?;
+
+ let remote_file = RemoteFile {
+ local_path: path.to_owned(),
+ date_created: Local::now().into(),
+ ..Default::default()
+ };
+
+ Ok(remote_file)
+}
diff --git a/src/dataset/toy1/edge_cases.rs b/src/dataset/toy1/edge_cases.rs
new file mode 100644
index 0000000..dde3f72
--- /dev/null
+++ b/src/dataset/toy1/edge_cases.rs
@@ -0,0 +1,8 @@
+use crate::cli::run;
+use color_eyre::eyre::{Report, Result};
+
+/// Create default Toy1 recombinant edge cases.
+pub fn default() -> Result<Vec<run::Args>, Report> {
+ let edge_cases: Vec<run::Args> = Vec::new();
+ Ok(edge_cases)
+}
diff --git a/src/dataset/toy1/mod.rs b/src/dataset/toy1/mod.rs
index 2925e81..f8bf309 100644
--- a/src/dataset/toy1/mod.rs
+++ b/src/dataset/toy1/mod.rs
@@ -1 +1,4 @@
+pub mod annotations;
+pub mod download;
+pub mod edge_cases;
pub mod phylogeny;
diff --git a/src/dataset/toy1/phylogeny.rs b/src/dataset/toy1/phylogeny.rs
index 7d92277..3c9ab82 100644
--- a/src/dataset/toy1/phylogeny.rs
+++ b/src/dataset/toy1/phylogeny.rs
@@ -1,38 +1,39 @@
use crate::phylogeny::Phylogeny;
use color_eyre::eyre::{Report, Result};
-use itertools::Itertools;
-use std::path::PathBuf;
pub fn build() -> Result<Phylogeny, Report> {
let mut phylogeny = Phylogeny::new();
- phylogeny.order = vec!["A", "B", "C"].into_iter().map(String::from).collect_vec();
// Add root node
let name = "root".to_string();
- phylogeny.graph.add_node(name.clone());
- let root_id = phylogeny.get_node(&name)?;
+ let root_id = phylogeny.graph.add_node(name.clone());
+ // Add A node
let name = "A".to_string();
- phylogeny.graph.add_node(name.clone());
- let a_id = phylogeny.get_node(&name)?;
+ let a_id = phylogeny.graph.add_node(name.clone());
phylogeny.graph.add_edge(root_id, a_id, 1);
+ // Add B node
let name = "B".to_string();
- phylogeny.graph.add_node(name.clone());
- let b_id = phylogeny.get_node(&name)?;
+ let b_id = phylogeny.graph.add_node(name.clone());
phylogeny.graph.add_edge(root_id, b_id, 1);
+ // Add C node
let name = "C".to_string();
- phylogeny.graph.add_node(name.clone());
- let c_id = phylogeny.get_node(&name)?;
- phylogeny.graph.add_edge(a_id, c_id, 1);
- phylogeny.graph.add_edge(b_id, c_id, 1);
- phylogeny.recombinants.push(name.clone());
-
- phylogeny.recombinants_all = phylogeny.get_recombinants_all()?;
-
- let output_path = PathBuf::from("phylogeny.json");
- phylogeny.write(&output_path)?;
+ let c_id = phylogeny.graph.add_node(name.clone());
+ phylogeny.graph.add_edge(root_id, c_id, 1);
+
+ // Add recombinant D node
+ let name = "D".to_string();
+ let d_id = phylogeny.graph.add_node(name.clone());
+ phylogeny.graph.add_edge(a_id, d_id, 1);
+ phylogeny.graph.add_edge(b_id, d_id, 1);
+
+ // Add recursive recombinant E node
+ let name = "E".to_string();
+ let e_id = phylogeny.graph.add_node(name.clone());
+ phylogeny.graph.add_edge(d_id, e_id, 1);
+ phylogeny.graph.add_edge(c_id, e_id, 1);
Ok(phylogeny)
}
diff --git a/src/export/mod.rs b/src/export/mod.rs
index 448e00b..71dddf5 100644
--- a/src/export/mod.rs
+++ b/src/export/mod.rs
@@ -1,8 +1,9 @@
use crate::dataset::{Dataset, SearchResult};
use crate::recombination::{validate, Recombination};
use crate::utils;
-use color_eyre::eyre::{eyre, Report, Result};
+use color_eyre::eyre::{Report, Result};
use itertools::Itertools;
+use log::warn;
// ----------------------------------------------------------------------------
// LineList
@@ -36,15 +37,6 @@ pub fn linelist(
// iterate in parallel, checking for same sequence id
for (best_match, recombination) in results {
- // check that they're in the correct order
- if recombination.sequence.id != best_match.sequence_id {
- return Err(eyre!(
- "Recombination ID {} is not the same as Best Match ID: {}",
- recombination.sequence.id,
- best_match.sequence_id,
- ));
- }
-
// initialize the table row
let mut row = vec![String::new(); table.headers.len()];
@@ -119,8 +111,22 @@ pub fn linelist(
subs_by_origin.push(subs_format)
}
});
- } else {
- let support = &best_match.support[&best_match.consensus_population];
+ }
+ // This error has been observed before, though it's unclear exactly when it
+ // occurs; possibly for the sars-cov-2 root population B?
+ else {
+ if !best_match.support.contains_key(&best_match.consensus_population) {
+ warn!(
+ "Sequence {:?} has no support recorded for it's consensus population {:?}",
+ &best_match.sequence_id,
+ &best_match.consensus_population,
+ );
+ }
+ let support = best_match
+ .support
+ .get(&best_match.consensus_population)
+ .cloned()
+ .unwrap_or_default();
if !support.is_empty() {
let subs_format = format!(
diff --git a/src/lib.rs b/src/lib.rs
index 8d3d2f8..f5f6200 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -4,551 +4,7 @@ pub mod export;
pub mod phylogeny;
pub mod plot;
pub mod recombination;
+pub mod run;
pub mod sequence;
-#[cfg(test)]
-pub mod test;
+pub mod simulate;
pub mod utils;
-
-use crate::dataset::attributes::Name;
-use crate::dataset::SearchResult;
-use crate::recombination::Recombination;
-use crate::sequence::Sequence;
-use bio::io::fasta;
-use color_eyre::eyre::{eyre, Report, Result, WrapErr};
-use color_eyre::Help;
-use indicatif::{style::ProgressStyle, ProgressBar};
-use itertools::Itertools;
-use log::{debug, info, warn};
-use rand::Rng;
-use rayon::prelude::*;
-use std::fs::{create_dir_all, File};
-use std::io::Write;
-
-/// Download rebar dataset
-pub async fn download_dataset(
- args: &mut cli::dataset::download::Args,
-) -> Result<(), Report> {
- dataset::download::dataset(args).await?;
- Ok(())
-}
-
-/// List rebar datasets
-pub async fn list_datasets(args: &cli::dataset::list::Args) -> Result<(), Report> {
- dataset::list::datasets(args).await?;
- Ok(())
-}
-
-/// Simulate recombination.
-pub fn simulate(args: &cli::simulate::Args) -> Result<(), Report> {
- // create output directory if it doesn't exist
- if !args.output_dir.exists() {
- info!("Creating output directory: {:?}", args.output_dir);
- create_dir_all(&args.output_dir)?;
- }
-
- // Load dataset, disable masking
- info!("Loading dataset: {:?}", &args.dataset_dir);
- let mask = vec![0, 0];
- let dataset = dataset::load::dataset(&args.dataset_dir, &mask)?;
- let genome_length = dataset.reference.genome_length;
-
- // Check to make sure all parents are in dataset
- let parents = args.parents.clone();
- for parent in &parents {
- if !dataset.populations.contains_key(parent.as_str()) {
- return Err(eyre!(
- "Parent {parent} is not the dataset populations fasta."
- ));
- }
- }
-
- // ------------------------------------------------------------------------
- // Breakpoints
-
- let mut breakpoints = if let Some(breakpoints) = &args.breakpoints {
- info!("Using manual breakpoints: {breakpoints:?}");
- breakpoints.clone()
- } else {
- let mut breakpoints = Vec::new();
- let mut num_breakpoints_remaining = parents.len() - 1;
- let mut start = 1;
- let mut rng = rand::thread_rng();
- while num_breakpoints_remaining > 0 {
- // save some coordinates for future breakpoints
- let end = genome_length - num_breakpoints_remaining;
- let coord = rng.gen_range(start..end);
- breakpoints.push(coord);
- start = coord + 1;
- num_breakpoints_remaining -= 1;
- }
- info!("Using random breakpoints: {breakpoints:?}");
- breakpoints
- };
-
- let unique_key = format!(
- "simulate_{}_{}",
- &parents.iter().join("_"),
- &breakpoints.iter().map(|start| format!("{start}-{}", start + 1)).join("_"),
- );
- info!("Unique Key: {unique_key:?}");
-
- // ------------------------------------------------------------------------
- // Regions
-
- breakpoints.push(genome_length);
- let mut regions = Vec::new();
- let mut start = 1;
-
- for (origin, end) in parents.into_iter().zip(breakpoints.into_iter()) {
- let region = recombination::Region {
- start,
- end,
- origin,
- substitutions: Vec::new(),
- };
- regions.push(region);
- start = end + 1;
- }
- debug!("Regions: {regions:?}");
-
- // ------------------------------------------------------------------------
- // Sequences
-
- let sequence: String = regions
- .iter()
- .map(|region| {
- let sequence = dataset.populations.get(&region.origin).unwrap();
- // Reminder, -1 to coordinates since they are 1-based
- sequence.seq[region.start - 1..=region.end - 1].iter().collect::<String>()
- })
- .collect();
-
- let output_path = args.output_dir.join(format!("{unique_key}.fasta"));
- info!("Exporting fasta: {output_path:?}");
- let mut output_file = File::create(&output_path)
- .wrap_err_with(|| format!("Unable to create file: {output_path:?}"))?;
- let lines = format!(">{unique_key}\n{sequence}");
- output_file
- .write_all(lines.as_bytes())
- .wrap_err_with(|| format!("Unable to write file: {output_path:?}"))?;
-
- Ok(())
-}
-
-/// Run rebar on input alignment and/or dataset population(s)
-pub fn run(args: &mut cli::run::Args) -> Result<(), Report> {
- // copy args for export/seralizing
- let args_export = args.clone();
-
- // create output directory if it doesn't exist
- if !args.output_dir.exists() {
- info!("Creating output directory: {:?}", args.output_dir);
- create_dir_all(&args.output_dir)?;
- }
- // make sure output directory is empty!
- else {
- let output_dir_is_empty = args.output_dir.read_dir()?.next().is_none();
- if !output_dir_is_empty {
- return Err(eyre!(
- "--output-dir {:?} already exists and is not empty!",
- args.output_dir
- )
- .suggestion("Please change your --output-dir to a new or empty directory."));
- }
- }
-
- // check how many threads are available on the system
- let default_thread_pool = rayon::ThreadPoolBuilder::new().build().unwrap();
- info!(
- "Number of threads available: {}",
- default_thread_pool.current_num_threads()
- );
-
- // warn the user if they requested more than their system has available
- // if so, default to the system threads
- let mut num_threads = args.threads;
- if args.threads > default_thread_pool.current_num_threads() {
- warn!(
- "Requested --threads {} is greater than the available threads.",
- args.threads
- );
- num_threads = default_thread_pool.current_num_threads();
- }
-
- // configure the global thread pool
- info!("Using {} thread(s).", num_threads);
- rayon::ThreadPoolBuilder::new().num_threads(num_threads).build_global().unwrap();
-
- // configure progress bar style
- let progress_bar_style = ProgressStyle::with_template(
- "{bar:40} {pos}/{len} ({percent}%) | Sequences / Second: {per_sec} | Elapsed: {elapsed_precise} | ETA: {eta_precise}"
- ).unwrap();
-
- // Collect files in dataset_dir into a dataset object
- // This mainly includes parent populations sequences
- // and optionally a phylogenetic representation.
- let mut dataset = dataset::load::dataset(&args.dataset_dir, &args.mask)?;
-
- // init a container to hold query sequences, dataset
- // populations and/or sequences from an input alignment
- let mut sequences = Vec::new();
- // keep track of ids we've seen to remove duplicates later
- let mut ids_seen = Vec::new();
-
- // ------------------------------------------------------------------------
- // Parse Input Populations
- // ------------------------------------------------------------------------
-
- // this step is pretty fast, don't really need a progress bar here
-
- if let Some(populations) = &args.input.populations {
- info!("Parsing input populations: {populations:?}");
-
- dataset.expand_populations(populations)?.into_iter().for_each(|p| {
- if !dataset.populations.contains_key(&p) {
- warn!("Population {p} is not in the dataset populations fasta.");
- } else {
- debug!("Adding population {p} to query sequences.");
- let mut sequence = dataset.populations[&p].clone();
- sequence.id = format!("population_{}", sequence.id);
- ids_seen.push(sequence.id.clone());
- sequences.push(sequence);
- }
- });
- }
-
- // ------------------------------------------------------------------------
- // Parse Input Alignment
- // ------------------------------------------------------------------------
-
- if let Some(alignment) = &args.input.alignment {
- info!("Loading query alignment: {:?}", alignment);
- let alignment_reader = fasta::Reader::from_file(alignment)
- .map_err(|e| eyre!(e))
- .wrap_err("Failed to read file: {alignment:?}")?;
-
- for result in alignment_reader.records() {
- let record = result.wrap_err("Unable to parse alignment: {alignment:?}")?;
- let sequence =
- Sequence::from_record(record, Some(&dataset.reference), &args.mask)?;
-
- // check for duplicates
- if ids_seen.contains(&sequence.id) {
- warn!(
- "Sequence {} is duplicated, retaining first one.",
- sequence.id
- );
- continue;
- } else {
- ids_seen.push(sequence.id.clone());
- sequences.push(sequence);
- }
- }
- }
-
- // ------------------------------------------------------------------------
- // Parse and expand input parents
-
- if let Some(parents) = &args.parents {
- info!("Parsing input parents: {:?}", &parents);
- args.parents = Some(dataset.expand_populations(parents)?);
- }
-
- // ------------------------------------------------------------------------
- // Dataset Knockout
- // ------------------------------------------------------------------------
-
- if let Some(knockout) = &args.knockout {
- info!("Performing dataset knockout: {knockout:?}");
-
- let mut expanded_knockout = Vec::new();
-
- for p in knockout {
- // Replace the wildcard, knockout will always include descendants
- let p = p.replace('*', "");
-
- // Identify descendants
- let exclude_populations = if dataset.phylogeny.is_empty() {
- vec![p.to_string()]
- } else {
- dataset.phylogeny.get_descendants(&p)?
- };
-
- // remove from populations
- debug!("Removing {p}* from the populations fasta.");
- dataset.populations.retain(|id, _| !exclude_populations.contains(id));
-
- // remove from mutations
- debug!("Removing {p}* from the mutations.");
- dataset.mutations = dataset
- .mutations
- .into_iter()
- .filter_map(|(sub, mut populations)| {
- populations.retain(|p| !exclude_populations.contains(p));
- if populations.is_empty() {
- None
- } else {
- Some((sub, populations))
- }
- })
- .collect();
-
- // remove from phylogeny
- if !dataset.phylogeny.is_empty() {
- debug!("Removing {p}* from the phylogeny.");
- dataset.phylogeny = dataset.phylogeny.prune(&p)?;
- }
-
- expanded_knockout.extend(exclude_populations);
- }
-
- args.knockout = Some(expanded_knockout);
- }
-
- // ------------------------------------------------------------------------
- // Recombination Search
- // ------------------------------------------------------------------------
-
- info!("Running recombination search.");
-
- // this step is the slowest, use progress bar and parallel threads
- let progress_bar = ProgressBar::new(sequences.len() as u64);
- progress_bar.set_style(progress_bar_style);
-
- // adjust search populations based on args.parents and args.knockout
- let mut parent_search_populations = dataset.populations.keys().collect_vec();
- // if args.parents supplied on the CLI
- if let Some(populations) = &args.parents {
- parent_search_populations.retain(|pop| populations.contains(pop))
- }
- // if args.knockout supplied on the CLI
- if let Some(populations) = &args.knockout {
- parent_search_populations.retain(|pop| !populations.contains(pop))
- }
-
- // Search for the best match and recombination parents for each sequence.
- // This loop/closure is structured weirdly for rayon compatability, and the
- // fact that we need to return multiple types of objects
- let results: Vec<(SearchResult, Recombination)> = sequences
- .par_iter()
- .map(|sequence| {
- // initialize with default results, regardless of whether our
- // searches "succeed", we're going to return standardized data
- // structures to build our exports upon (ex. linelist columns)
- // which will include the "negative" results
- let mut best_match = SearchResult::new(sequence);
- let mut recombination = Recombination::new(sequence);
-
- // ------------------------------------------------------------------------
- // Best Match (Consensus)
- //
- // search for the best match in the dataset to this sequence.
- // this will represent the consensus population call.
-
- debug!("Identifying best match (consensus population).");
- let search_result = dataset.search(sequence, None, None);
-
- // if we found a match, proceed with recombinant search
- if let Ok(search_result) = search_result {
- best_match = search_result;
-
- debug!("Searching for recombination parents.");
- let parent_search = recombination::search::all_parents(
- sequence,
- &dataset,
- &mut best_match,
- &parent_search_populations,
- args,
- );
- match parent_search {
- Ok(search_result) => recombination = search_result,
- Err(e) => debug!("Parent search did not succeed. {e}"),
- }
- }
- // what to do if not a single population matched?
- else {
- // temporary handling for root population B
- if dataset.name == Name::SarsCov2 {
- if sequence.id == "population_B" {
- best_match.consensus_population = "B".to_string();
- }
- } else {
- debug!("No matches found.");
- }
- }
-
- progress_bar.inc(1);
-
- (best_match, recombination)
- })
- .collect();
-
- progress_bar.finish();
-
- // ------------------------------------------------------------------------
- // Export CLI args
-
- let outpath_args = args.output_dir.join("run_args.json");
- info!("Exporting CLI Run Args: {outpath_args:?}");
- // create output file
- let mut file = File::create(&outpath_args)
- .wrap_err_with(|| format!("Failed to create file: {outpath_args:?}"))?;
-
- // parse to string
- let output = serde_json::to_string_pretty(&args_export)
- .wrap_err_with(|| "Failed to parse mutations.".to_string())?;
-
- // write to file
- file.write_all(format!("{}\n", output).as_bytes())
- .wrap_err_with(|| format!("Failed to write file: {outpath_args:?}"))?;
-
- // ------------------------------------------------------------------------
- // Export Linelist (single)
-
- let outpath_linelist = args.output_dir.join("linelist.tsv");
- info!("Exporting linelist: {outpath_linelist:?}");
-
- let linelist_table = export::linelist(&results, &dataset)?;
- //let linelist_table = export::linelist(&best_matches, &recombinations, &dataset)?;
- linelist_table.write(&outpath_linelist)?;
-
- // ------------------------------------------------------------------------
- // Export Barcodes (multiple, collected by recombinant)
-
- let outdir_barcodes = args.output_dir.join("barcodes");
-
- create_dir_all(&outdir_barcodes)?;
-
- // get unique keys of recombinants identified
- let unique_keys = results
- .iter()
- .filter_map(|(_b, r)| (*r.unique_key != String::new()).then_some(&r.unique_key))
- .unique()
- .collect_vec();
-
- if unique_keys.is_empty() {
- warn!("No recombination detected, no barcodes will be outputted.");
- } else {
- info!("Exporting recombination barcodes: {outdir_barcodes:?}");
- }
-
- for unique_key in unique_keys {
- // filter recombinations down to just this recombinant unique_key
- let unique_rec = results
- .iter()
- .filter_map(|(_b, r)| (r.unique_key == *unique_key).then_some(r))
- .cloned()
- .collect_vec();
- // combine all the sample barcode tables
- let barcode_table =
- recombination::combine_tables(&unique_rec, &dataset.reference)?;
- let barcode_table_path = outdir_barcodes.join(format!("{unique_key}.tsv"));
- barcode_table.write(&barcode_table_path)?;
- }
-
- info!("Done.");
- Ok(())
-}
-
-/// Plot rebar output
-pub async fn plot(args: cli::plot::Args) -> Result<(), Report> {
- // ------------------------------------------------------------------------
- // Parse Args
-
- let dataset_dir = &args.dataset_dir;
- if !dataset_dir.exists() {
- return Err(eyre!("--dataset-dir {dataset_dir:?} does not exist."));
- }
- let output_dir = &args.output_dir;
- if !output_dir.exists() {
- return Err(eyre!("--output-dir {output_dir:?} does not exist."));
- }
-
- // ------------------------------------------------------------------------
- // Check Mandatory Paths
-
- let linelist = &output_dir.join("linelist.tsv");
- if !linelist.exists() {
- return Err(eyre!(
- "Linelist file {linelist:?} does not exist in --output-dir {output_dir:?}."
- ));
- }
- let barcodes_dir = &output_dir.join("barcodes");
- if !linelist.exists() {
- return Err(eyre!(
- "Barcodes directory {barcodes_dir:?} does not exist in --output-dir {output_dir:?}."
- ));
- }
-
- let annotations_path = dataset_dir.join("annotations.tsv");
- let annotations = if annotations_path.exists() {
- Some(annotations_path)
- } else {
- warn!(
- "Annotations {annotations_path:?} do not exist in --dataset-dir {dataset_dir:?}."
- );
- None
- };
-
- // create plot directory if it doesn't exist
- let plot_dir = args.plot_dir.unwrap_or(output_dir.join("plots"));
- if !plot_dir.exists() {
- info!("Creating plot directory: {plot_dir:?}");
- create_dir_all(&plot_dir)?;
- }
-
- // ------------------------------------------------------------------------
- // List of Barcodes Files
-
- let mut barcodes_files: Vec<PathBuf> = Vec::new();
-
- // Input File Specified
- let barcodes_file = &args.barcodes_file;
- if let Some(barcodes_file) = barcodes_file {
- if !barcodes_file.exists() {
- return Err(eyre!("Barcodes file {barcodes_file:?} does not exist."));
- }
- barcodes_files.push(barcodes_file.clone());
- }
- // Otherwise, use all files in barcodes_dir
- else {
- let files = std::fs::read_dir(barcodes_dir)?;
- for result in files {
- let file_path = result?.path();
- let file_ext = file_path.extension().unwrap_or(std::ffi::OsStr::new(""));
- if file_ext == "tsv" {
- barcodes_files.push(file_path.clone());
- } else {
- warn!("Skipping barcodes file with unknown extension: {file_path:?}")
- }
- }
- }
-
- // ------------------------------------------------------------------------
- // Plot Each Barcodes
-
- for barcodes_file in barcodes_files {
- info!("Plotting barcodes file: {:?}", barcodes_file);
- let output_prefix = barcodes_file.file_stem().unwrap().to_str().unwrap();
- let output_path = plot_dir.join(format!("{}.png", output_prefix));
- let result = plot::create(
- &barcodes_file,
- linelist,
- annotations.as_deref(),
- &output_path,
- &args.font_cache,
- )
- .await;
- match result {
- Ok(_) => info!("Plotting success."),
- Err(e) => {
- // todo!() decide on whether we ignore or raise error
- return Err(e);
- //warn!("Plotting failure. The following error was encountered but ignored: {e:?}")
- }
- }
- }
-
- info!("Done.");
- Ok(())
-}
diff --git a/src/main.rs b/src/main.rs
index 6a35a8c..fd7b517 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -21,23 +21,19 @@ async fn main() -> Result<(), Report> {
// check which CLI command we're running (dataset, run, plot)
match args.command {
- // --------------------------------------------------------------------
// Dataset
Command::Dataset(args) => match args.command {
- dataset::Command::List(args) => rebar::list_datasets(&args).await?,
+ dataset::Command::List(args) => rebar::dataset::list::datasets(&args)?,
dataset::Command::Download(mut args) => {
- rebar::download_dataset(&mut args).await?
+ rebar::dataset::download::dataset(&mut args).await?
}
},
- // --------------------------------------------------------------------
// Run
- Command::Run(mut args) => rebar::run(&mut args)?,
- // --------------------------------------------------------------------
+ Command::Run(mut args) => rebar::run::run(&mut args)?,
// Plot
- Command::Plot(args) => rebar::plot(*args).await?,
- // --------------------------------------------------------------------
+ Command::Plot(args) => rebar::plot::plot(&args)?,
// Simulate
- Command::Simulate(args) => rebar::simulate(&args)?,
+ Command::Simulate(args) => rebar::simulate::simulate(&args)?,
}
Ok(())
diff --git a/src/phylogeny/mod.rs b/src/phylogeny/mod.rs
index 81604d5..546e69e 100644
--- a/src/phylogeny/mod.rs
+++ b/src/phylogeny/mod.rs
@@ -21,8 +21,10 @@ use std::string::ToString;
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct Phylogeny {
pub graph: Graph,
- pub order: Vec<String>,
+ // we will parse recombinants on load/read
+ #[serde(skip_serializing, skip_deserializing)]
pub recombinants: Vec<String>,
+ #[serde(skip_serializing, skip_deserializing)]
pub recombinants_all: Vec<String>,
}
@@ -36,7 +38,6 @@ impl Phylogeny {
pub fn new() -> Self {
Phylogeny {
graph: Graph::new(),
- order: Vec::new(),
recombinants: Vec::new(),
recombinants_all: Vec::new(),
}
@@ -128,8 +129,6 @@ impl Phylogeny {
}
}
- // Remove from order
- phylogeny.order.retain(|n| !descendants.contains(n));
// Remove from recombinants
phylogeny.recombinants.retain(|r| !descendants.contains(r));
@@ -140,9 +139,12 @@ impl Phylogeny {
pub fn read(path: &Path) -> Result<Phylogeny, Report> {
let phylogeny = std::fs::read_to_string(path)
.wrap_err_with(|| format!("Failed to read file: {path:?}."))?;
- let phylogeny = serde_json::from_str(&phylogeny)
+ let mut phylogeny: Phylogeny = serde_json::from_str(&phylogeny)
.wrap_err_with(|| format!("Failed to parse file: {path:?}."))?;
+ phylogeny.recombinants = phylogeny.get_recombinants()?;
+ phylogeny.recombinants_all = phylogeny.get_recombinants_all()?;
+
Ok(phylogeny)
}
@@ -374,7 +376,11 @@ impl Phylogeny {
// Which ancestors were found in all populations?
if populations.len() == names.len() {
// Which ancestor has the max depth?
- let depth = ancestor_depths[&ancestor];
+
+ let depth = ancestor_depths
+ .get(&ancestor)
+ .cloned()
+ .expect("Ancestor {ancestor} was not found in ancestor depths.");
if depth > max_depth {
max_depth = depth;
common_ancestor = ancestor;
diff --git a/src/plot/mod.rs b/src/plot/mod.rs
index 1dbfc8b..d8b8c38 100644
--- a/src/plot/mod.rs
+++ b/src/plot/mod.rs
@@ -2,21 +2,120 @@ pub mod constants;
pub mod polygon;
pub mod text;
+use crate::cli;
use crate::utils::table::Table;
use color_eyre::eyre::{eyre, Report, Result};
use color_eyre::Help;
use itertools::Itertools;
-use log::debug;
+use log::{debug, info, warn};
use raqote::*;
+use std::fs::create_dir_all;
use std::path::Path;
+/// Plot rebar output
+pub fn plot(args: &cli::plot::Args) -> Result<(), Report> {
+ // ------------------------------------------------------------------------
+ // Parse Args
+
+ let dataset_dir = &args.dataset_dir;
+ if !dataset_dir.exists() {
+ return Err(eyre!("--dataset-dir {dataset_dir:?} does not exist."));
+ }
+ let run_dir = &args.run_dir;
+ if !run_dir.exists() {
+ return Err(eyre!("--run-dir {run_dir:?} does not exist."));
+ }
+
+ // ------------------------------------------------------------------------
+ // Check Mandatory Paths
+
+ let linelist = &run_dir.join("linelist.tsv");
+ if !linelist.exists() {
+ return Err(eyre!(
+ "Linelist file {linelist:?} does not exist in --run-dir {run_dir:?}."
+ ));
+ }
+ let barcodes_dir = &run_dir.join("barcodes");
+ if !barcodes_dir.exists() {
+ return Err(eyre!(
+ "Barcodes directory {barcodes_dir:?} does not exist in --run-dir {run_dir:?}."
+ ));
+ }
+
+ let annotations_path = dataset_dir.join("annotations.tsv");
+ let annotations = if annotations_path.exists() {
+ Some(annotations_path)
+ } else {
+ warn!(
+ "Annotations {annotations_path:?} do not exist in --dataset-dir {dataset_dir:?}."
+ );
+ None
+ };
+
+ // create plot directory if it doesn't exist
+ let output_dir = args.output_dir.clone().unwrap_or(run_dir.join("plots"));
+ if !output_dir.exists() {
+ info!("Creating plot directory: {output_dir:?}");
+ create_dir_all(&output_dir)?;
+ }
+
+ // ------------------------------------------------------------------------
+ // List of Barcodes Files
+
+ let mut barcodes_files: Vec<std::path::PathBuf> = Vec::new();
+
+ // Input File Specified
+ let barcodes_file = &args.barcodes_file;
+ if let Some(barcodes_file) = barcodes_file {
+ if !barcodes_file.exists() {
+ return Err(eyre!("Barcodes file {barcodes_file:?} does not exist."));
+ }
+ barcodes_files.push(barcodes_file.clone());
+ }
+ // Otherwise, use all files in barcodes_dir
+ else {
+ let files = std::fs::read_dir(barcodes_dir)?;
+ for result in files {
+ let file_path = result?.path();
+ let file_ext = file_path.extension().unwrap_or(std::ffi::OsStr::new(""));
+ if file_ext == "tsv" {
+ barcodes_files.push(file_path.clone());
+ } else {
+ warn!("Skipping barcodes file with unknown extension: {file_path:?}")
+ }
+ }
+ }
+
+ // ------------------------------------------------------------------------
+ // Plot Each Barcodes File
+
+ for barcodes_file in barcodes_files {
+ info!("Plotting barcodes file: {:?}", barcodes_file);
+ let output_prefix = barcodes_file
+ .file_stem()
+ .unwrap_or_else(|| panic!("Failed to get file stem of {barcodes_file:?}"))
+ .to_str()
+ .unwrap_or_else(|| panic!("Failed to convert file stem of {barcodes_file:?} to str."));
+ let output_path = output_dir.join(format!("{}.png", output_prefix));
+ create(
+ &barcodes_file,
+ linelist,
+ annotations.as_deref(),
+ &output_path,
+ )?;
+ info!("Plotting success.");
+ }
+
+ info!("Done.");
+ Ok(())
+}
+
#[allow(unused_variables)]
-pub async fn create(
+pub fn create(
barcodes_path: &Path,
linelist_path: &Path,
annotations_path: Option<&Path>,
output_path: &Path,
- font_cache: &Path,
) -> Result<(), Report> {
// ------------------------------------------------------------------------
// Import Data
diff --git a/src/recombination/search.rs b/src/recombination/search.rs
index 2ab9f24..08e7c4e 100644
--- a/src/recombination/search.rs
+++ b/src/recombination/search.rs
@@ -4,7 +4,7 @@ use crate::recombination::{detect_recombination, validate, Hypothesis, Recombina
use crate::sequence::Sequence;
use color_eyre::eyre::{eyre, Report, Result};
use itertools::Itertools;
-use log::debug;
+use log::{debug, warn};
use std::collections::BTreeMap;
use strum::IntoEnumIterator;
@@ -243,13 +243,12 @@ pub fn all_parents<'seq>(
// if the conflict range between hypotheses is large (>=10), prefer min_conflict
// otherwise, prefer max_score
- let best_hypothesis = if conflict_range >= conflict_threshold {
+ let best_hypotheses = if conflict_range >= conflict_threshold {
debug!("Best hypothesis selected by MIN CONFLICT. Conflict range ({conflict_range}) >= threshold ({conflict_threshold}).");
hypotheses
.iter()
.filter_map(|(hyp, (_r, _p, _s, c))| (c == min_conflict).then_some(hyp))
- .next()
- .unwrap()
+ .collect_vec()
} else {
debug!("Best hypothesis selected by MAX SCORE. Conflict range ({conflict_range}) < threshold ({conflict_threshold})");
let max_score =
@@ -257,11 +256,30 @@ pub fn all_parents<'seq>(
hypotheses
.iter()
.filter_map(|(hyp, (_r, _p, s, _c))| (s == max_score).then_some(hyp))
- .next()
- .unwrap()
+ .collect_vec()
};
- best_hypothesis.clone()
+ // if hypotheses are tied, prefer them in enum order (first before last)
+ // note: this currently means Designated is preferred over non-designated.
+ // prefer this for now, since we can use --naive to disable designated
+ let hypothesis_ranks: BTreeMap<Hypothesis, usize> =
+ Hypothesis::iter().enumerate().map(|(i, h)| (h, i)).collect();
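+ // e.g. if Designated is declared first in the enum, it maps to rank 0
+ // and wins any tie against later variants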
+ let best_hyp_rank = best_hypotheses
+ .iter()
+ .map(|hyp| {
+ hypothesis_ranks
+ .get(hyp)
+ .expect("Hypothesis ranks does not contain hypothesis {hyp:?}")
+ })
+ .min()
+ .unwrap();
+ let best_hypothesis = hypothesis_ranks
+ .iter()
+ .filter_map(|(hyp, r)| (r == best_hyp_rank).then_some(hyp))
+ .next()
+ .unwrap();
+
+ best_hypothesis.to_owned()
};
debug!("best_hypothesis: {best_hypothesis:?}");
@@ -270,7 +288,9 @@ pub fn all_parents<'seq>(
if best_hypothesis == Hypothesis::NonRecombinant {
return Err(eyre!("Best hypothesis is Non-Recombinant."));
}
- let result = hypotheses.remove(&best_hypothesis).unwrap();
+ let result = hypotheses
+ .remove(&best_hypothesis)
+ .expect("Hypotheses does not contain the best hypothesis {best_hypothesis:?}");
let mut recombination = result.0.unwrap();
let primary_parent = result.1[0].clone();
@@ -424,7 +444,16 @@ pub fn secondary_parents<'seq>(
let conflict_ref = parents
.iter()
// get conflict REF between this parent and the sequence
- .flat_map(|p| &p.conflict_ref[&p.consensus_population])
+ .flat_map(|p| {
+ if !p.conflict_ref.contains_key(&p.consensus_population) {
+ warn!(
+ "Parent {:?} has no conflict_ref recorded for it's consensus population {:?}",
+ &p.sequence_id,
+ &p.consensus_population,
+ );
+ }
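+ // fall back to an empty collection so a missing entry simply
+ // contributes no conflict_ref substitutions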
+ p.conflict_ref.get(&p.consensus_population).cloned().unwrap_or_default()
+ })
.unique()
// search for parents that have the REF base (no ALT)
.filter(|sub| {
diff --git a/src/run/mod.rs b/src/run/mod.rs
new file mode 100644
index 0000000..80603fe
--- /dev/null
+++ b/src/run/mod.rs
@@ -0,0 +1,337 @@
+use crate::cli;
+use crate::dataset;
+use crate::export;
+use crate::recombination;
+
+use crate::dataset::{attributes::Name, SearchResult};
+use crate::recombination::Recombination;
+use crate::sequence::Sequence;
+use bio::io::fasta;
+use color_eyre::eyre::{eyre, Report, Result, WrapErr};
+use color_eyre::Help;
+use indicatif::{style::ProgressStyle, ProgressBar};
+use itertools::Itertools;
+use log::{debug, info, warn};
+use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
+use std::fs::{create_dir_all, File};
+use std::io::Write;
+
+/// Run rebar on input alignment and/or dataset population(s)
+pub fn run(args: &mut cli::run::Args) -> Result<(), Report> {
+ // copy args for export/serializing
+ let args_export = args.clone();
+
+ // create output directory if it doesn't exist
+ if !args.output_dir.exists() {
+ info!("Creating output directory: {:?}", args.output_dir);
+ create_dir_all(&args.output_dir)?;
+ }
+ // make sure output directory is empty!
+ else {
+ let output_dir_is_empty = args.output_dir.read_dir()?.next().is_none();
+ if !output_dir_is_empty {
+ return Err(eyre!(
+ "--output-dir {:?} already exists and is not empty!",
+ args.output_dir
+ )
+ .suggestion("Please change your --output-dir to a new or empty directory."));
+ }
+ }
+
+ // check how many threads are available on the system
+ let default_thread_pool =
+ rayon::ThreadPoolBuilder::new().build().expect("Failed to build thread pool.");
+ info!(
+ "Number of threads available: {}",
+ default_thread_pool.current_num_threads()
+ );
+
+ // warn the user if they requested more than their system has available
+ // if so, default to the system threads
+ let mut num_threads = args.threads;
+ if args.threads > default_thread_pool.current_num_threads() {
+ warn!(
+ "Requested --threads {} is greater than the available threads.",
+ args.threads
+ );
+ num_threads = default_thread_pool.current_num_threads();
+ }
+
+ // configure the global thread pool
+ info!("Using {} thread(s).", num_threads);
+ let result = rayon::ThreadPoolBuilder::new().num_threads(num_threads).build_global();
+
+ if result.is_err() {
+ warn!("Failed to build global thread pool.");
+ }
+
+ // configure progress bar style
+ let progress_bar_style = ProgressStyle::with_template(
+ "{bar:40} {pos}/{len} ({percent}%) | Sequences / Second: {per_sec} | Elapsed: {elapsed_precise} | ETA: {eta_precise}"
+ ).expect("Failed to create progress bar from template.");
+
+ // Collect files in dataset_dir into a dataset object
+ // This mainly includes parent population sequences
+ // and optionally a phylogenetic representation.
+ let mut dataset = dataset::load::dataset(&args.dataset_dir, &args.mask)?;
+
+ // init a container to hold query sequences, dataset
+ // populations and/or sequences from an input alignment
+ let mut sequences = Vec::new();
+ // keep track of ids we've seen to remove duplicates later
+ let mut ids_seen = Vec::new();
+
+ // ------------------------------------------------------------------------
+ // Parse Input Populations
+ // ------------------------------------------------------------------------
+
+ // this step is pretty fast, don't really need a progress bar here
+
+ if let Some(populations) = &args.input.populations {
+ info!("Parsing input populations: {populations:?}");
+
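+ // expand any wildcards first, e.g. "AY.4.2*" covers AY.4.2 plus its descendants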
+ dataset.expand_populations(populations)?.into_iter().for_each(|p| {
+ if !dataset.populations.contains_key(&p) {
+ warn!("Population {p} is not in the dataset populations fasta.");
+ } else {
+ debug!("Adding population {p} to query sequences.");
+ let mut sequence = dataset.populations.get(&p).unwrap().clone();
+ sequence.id = format!("population_{}", sequence.id);
+ ids_seen.push(sequence.id.clone());
+ sequences.push(sequence.clone());
+ }
+ });
+ }
+
+ // ------------------------------------------------------------------------
+ // Parse Input Alignment
+ // ------------------------------------------------------------------------
+
+ if let Some(alignment) = &args.input.alignment {
+ info!("Loading query alignment: {:?}", alignment);
+ let alignment_reader = fasta::Reader::from_file(alignment)
+ .map_err(|e| eyre!(e))
+ .wrap_err("Failed to read file: {alignment:?}")?;
+
+ for result in alignment_reader.records() {
+ let record = result.wrap_err_with(|| format!("Unable to parse alignment: {alignment:?}"))?;
+ let sequence =
+ Sequence::from_record(record, Some(&dataset.reference), &args.mask)?;
+
+ // check for duplicates
+ if ids_seen.contains(&sequence.id) {
+ warn!(
+ "Sequence {} is duplicated, retaining first one.",
+ sequence.id
+ );
+ continue;
+ } else {
+ ids_seen.push(sequence.id.clone());
+ sequences.push(sequence);
+ }
+ }
+ }
+
+ // ------------------------------------------------------------------------
+ // Parse and expand input parents
+
+ if let Some(parents) = &args.parents {
+ info!("Parsing input parents: {:?}", &parents);
+ args.parents = Some(dataset.expand_populations(parents)?);
+ }
+
+ // ------------------------------------------------------------------------
+ // Dataset Knockout
+ // ------------------------------------------------------------------------
+
+ if let Some(knockout) = &args.knockout {
+ info!("Performing dataset knockout: {knockout:?}");
+
+ let mut expanded_knockout = Vec::new();
+
+ for p in knockout {
+ // Replace the wildcard, knockout will always include descendants
+ let p = p.replace('*', "");
+
+ // Identify descendants
+ let exclude_populations = if dataset.phylogeny.is_empty() {
+ vec![p.to_string()]
+ } else {
+ dataset.phylogeny.get_descendants(&p)?
+ };
+
+ // remove from populations
+ debug!("Removing {p}* from the populations fasta.");
+ dataset.populations.retain(|id, _| !exclude_populations.contains(id));
+
+ // remove from mutations
+ debug!("Removing {p}* from the mutations.");
+ dataset.mutations = dataset
+ .mutations
+ .into_iter()
+ .filter_map(|(sub, mut populations)| {
+ populations.retain(|p| !exclude_populations.contains(p));
+ if populations.is_empty() {
+ None
+ } else {
+ Some((sub, populations))
+ }
+ })
+ .collect();
+
+ // remove from phylogeny
+ if !dataset.phylogeny.is_empty() {
+ debug!("Removing {p}* from the phylogeny.");
+ dataset.phylogeny = dataset.phylogeny.prune(&p)?;
+ }
+
+ expanded_knockout.extend(exclude_populations);
+ }
+
+ args.knockout = Some(expanded_knockout);
+ }
+
+ // ------------------------------------------------------------------------
+ // Recombination Search
+ // ------------------------------------------------------------------------
+
+ info!("Running recombination search.");
+
+ // this step is the slowest, use progress bar and parallel threads
+ let progress_bar = ProgressBar::new(sequences.len() as u64);
+ progress_bar.set_style(progress_bar_style);
+
+ // adjust search populations based on args.parents and args.knockout
+ let mut parent_search_populations = dataset.populations.keys().collect_vec();
+ // if args.parents supplied on the CLI
+ if let Some(populations) = &args.parents {
+ parent_search_populations.retain(|pop| populations.contains(pop))
+ }
+ // if args.knockout supplied on the CLI
+ if let Some(populations) = &args.knockout {
+ parent_search_populations.retain(|pop| !populations.contains(pop))
+ }
+
+ // Search for the best match and recombination parents for each sequence.
+ // This loop/closure is structured weirdly for rayon compatibility, and the
+ // fact that we need to return multiple types of objects
+ let results: Vec<(SearchResult, Recombination)> = sequences
+ .par_iter()
+ .map(|sequence| {
+ // initialize with default results, regardless of whether our
+ // searches "succeed", we're going to return standardized data
+ // structures to build our exports upon (ex. linelist columns)
+ // which will include the "negative" results
+ let mut best_match = SearchResult::new(sequence);
+ let mut recombination = Recombination::new(sequence);
+
+ // ------------------------------------------------------------------------
+ // Best Match (Consensus)
+ //
+ // search for the best match in the dataset to this sequence.
+ // this will represent the consensus population call.
+
+ debug!("Identifying best match (consensus population).");
+ let search_result = dataset.search(sequence, None, None);
+
+ // if we found a match, proceed with recombinant search
+ if let Ok(search_result) = search_result {
+ best_match = search_result;
+
+ debug!("Searching for recombination parents.");
+ let parent_search = recombination::search::all_parents(
+ sequence,
+ &dataset,
+ &mut best_match,
+ &parent_search_populations,
+ args,
+ );
+ match parent_search {
+ Ok(search_result) => recombination = search_result,
+ Err(e) => debug!("Parent search did not succeed. {e}"),
+ }
+ }
+ // what to do if not a single population matched?
+ else {
+ // temporary handling for root population B
+ if dataset.name == Name::SarsCov2 {
+ if sequence.id == "population_B" {
+ best_match.consensus_population = "B".to_string();
+ }
+ } else {
+ debug!("No matches found.");
+ }
+ }
+
+ progress_bar.inc(1);
+
+ (best_match, recombination)
+ })
+ .collect();
+
+ progress_bar.finish();
+
+ // ------------------------------------------------------------------------
+ // Export CLI args
+
+ let outpath_args = args.output_dir.join("run_args.json");
+ info!("Exporting CLI Run Args: {outpath_args:?}");
+ // create output file
+ let mut file = File::create(&outpath_args)
+ .wrap_err_with(|| format!("Failed to create file: {outpath_args:?}"))?;
+
+ // serialize to string
+ let output = serde_json::to_string_pretty(&args_export)
+ .wrap_err_with(|| "Failed to serialize CLI Run Args.".to_string())?;
+
+ // write to file
+ file.write_all(format!("{}\n", output).as_bytes())
+ .wrap_err_with(|| format!("Failed to write file: {outpath_args:?}"))?;
+
+ // ------------------------------------------------------------------------
+ // Export Linelist (single)
+
+ let outpath_linelist = args.output_dir.join("linelist.tsv");
+ info!("Exporting linelist: {outpath_linelist:?}");
+
+ let linelist_table = export::linelist(&results, &dataset)?;
+ //let linelist_table = export::linelist(&best_matches, &recombinations, &dataset)?;
+ linelist_table.write(&outpath_linelist)?;
+
+ // ------------------------------------------------------------------------
+ // Export Barcodes (multiple, collected by recombinant)
+
+ let outdir_barcodes = args.output_dir.join("barcodes");
+
+ create_dir_all(&outdir_barcodes)?;
+
+ // get unique keys of recombinants identified
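+ // (an empty unique_key means no recombination was detected for that sequence)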
+ let unique_keys = results
+ .iter()
+ .filter_map(|(_b, r)| (!r.unique_key.is_empty()).then_some(&r.unique_key))
+ .unique()
+ .collect_vec();
+
+ if unique_keys.is_empty() {
+ warn!("No recombination detected, no barcodes will be outputted.");
+ } else {
+ info!("Exporting recombination barcodes: {outdir_barcodes:?}");
+ }
+
+ for unique_key in unique_keys {
+ // filter recombinations down to just this recombinant unique_key
+ let unique_rec = results
+ .iter()
+ .filter_map(|(_b, r)| (r.unique_key == *unique_key).then_some(r))
+ .cloned()
+ .collect_vec();
+ // combine all the sample barcode tables
+ let barcode_table =
+ recombination::combine_tables(&unique_rec, &dataset.reference)?;
+ let barcode_table_path = outdir_barcodes.join(format!("{unique_key}.tsv"));
+ barcode_table.write(&barcode_table_path)?;
+ }
+
+ info!("Done.");
+ Ok(())
+}
diff --git a/src/sequence/mod.rs b/src/sequence/mod.rs
index 4c5d680..db0d551 100644
--- a/src/sequence/mod.rs
+++ b/src/sequence/mod.rs
@@ -155,6 +155,7 @@ impl Sequence {
return Err(
eyre!("5' and 3' masking ({mask:?}) is incompatible with sequence length {}", sample.seq.len())
.suggestion("Please change your --mask parameter.")
+ .suggestion("Maybe you want to disable masking all together with --mask 0,0 ?")
);
}
}
diff --git a/src/simulate/mod.rs b/src/simulate/mod.rs
new file mode 100644
index 0000000..0b981e3
--- /dev/null
+++ b/src/simulate/mod.rs
@@ -0,0 +1,112 @@
+use crate::cli;
+use crate::dataset;
+use crate::recombination;
+
+use color_eyre::eyre::{eyre, Report, Result, WrapErr};
+use itertools::Itertools;
+use log::{debug, info};
+use rand::Rng;
+use std::fs::{create_dir_all, File};
+use std::io::Write;
+
+/// Simulate recombination.
+pub fn simulate(args: &cli::simulate::Args) -> Result<(), Report> {
+ // create output directory if it doesn't exist
+ if !args.output_dir.exists() {
+ info!("Creating output directory: {:?}", args.output_dir);
+ create_dir_all(&args.output_dir)?;
+ }
+
+ // Load dataset, disable masking
+ info!("Loading dataset: {:?}", &args.dataset_dir);
+ let mask = vec![0, 0];
+ let dataset = dataset::load::dataset(&args.dataset_dir, &mask)?;
+ let genome_length = dataset.reference.genome_length;
+
+ // Check to make sure all parents are in dataset
+ let parents = args.parents.clone();
+ for parent in &parents {
+ if !dataset.populations.contains_key(parent.as_str()) {
+ return Err(eyre!(
+ "Parent {parent} is not the dataset populations fasta."
+ ));
+ }
+ }
+
+ // ------------------------------------------------------------------------
+ // Breakpoints
+
+ let mut breakpoints = if let Some(breakpoints) = &args.breakpoints {
+ info!("Using manual breakpoints: {breakpoints:?}");
+ breakpoints.clone()
+ } else {
+ let mut breakpoints = Vec::new();
+ let mut num_breakpoints_remaining = parents.len() - 1;
+ let mut start = 1;
+ let mut rng = rand::thread_rng();
+ while num_breakpoints_remaining > 0 {
+ // save some coordinates for future breakpoints
+ let end = genome_length - num_breakpoints_remaining;
+ let coord = rng.gen_range(start..end);
+ breakpoints.push(coord);
+ start = coord + 1;
+ num_breakpoints_remaining -= 1;
+ }
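+ // e.g. 3 parents need 2 breakpoints: the first draw is from 1..(genome_length - 2),
+ // leaving at least one coordinate free for the second breakpoint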
+ info!("Using random breakpoints: {breakpoints:?}");
+ breakpoints
+ };
+
+ let unique_key = format!(
+ "simulate_{}_{}",
+ &parents.iter().join("_"),
+ &breakpoints.iter().map(|start| format!("{start}-{}", start + 1)).join("_"),
+ );
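+ // e.g. parents ["XA", "XB"] (hypothetical names) with breakpoint 100
+ // produce "simulate_XA_XB_100-101"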
+ info!("Unique Key: {unique_key:?}");
+
+ // ------------------------------------------------------------------------
+ // Regions
+
+ breakpoints.push(genome_length);
+ let mut regions = Vec::new();
+ let mut start = 1;
+
+ for (origin, end) in parents.into_iter().zip(breakpoints.into_iter()) {
+ let region = recombination::Region {
+ start,
+ end,
+ origin,
+ substitutions: Vec::new(),
+ };
+ regions.push(region);
+ start = end + 1;
+ }
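+ // e.g. with breakpoints [100, genome_length], parent 1 owns region 1-100
+ // and parent 2 owns region 101-genome_length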
+ debug!("Regions: {regions:?}");
+
+ // ------------------------------------------------------------------------
+ // Sequences
+
+ let sequence: String = regions
+ .iter()
+ .map(|region| {
+ let sequence = dataset.populations.get(&region.origin).unwrap_or_else(|| {
+ panic!(
+ "Failed to find region origin {} in dataset populations.",
+ &region.origin
+ )
+ });
+ // Reminder, -1 to coordinates since they are 1-based
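+ // e.g. a region spanning 1-3 selects seq[0..=2]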
+ sequence.seq[region.start - 1..=region.end - 1].iter().collect::<String>()
+ })
+ .collect();
+
+ let output_path = args.output_dir.join(format!("{unique_key}.fasta"));
+ info!("Exporting fasta: {output_path:?}");
+ let mut output_file = File::create(&output_path)
+ .wrap_err_with(|| format!("Unable to create file: {output_path:?}"))?;
+ let lines = format!(">{unique_key}\n{sequence}");
+ output_file
+ .write_all(lines.as_bytes())
+ .wrap_err_with(|| format!("Unable to write file: {output_path:?}"))?;
+
+ Ok(())
+}
diff --git a/src/test/mod.rs b/src/test/mod.rs
deleted file mode 100644
index a0a8931..0000000
--- a/src/test/mod.rs
+++ /dev/null
@@ -1,47 +0,0 @@
-use crate::cli;
-use crate::dataset::attributes::{Name, Tag};
-use crate::dataset::download;
-use color_eyre::eyre::{Report, Result};
-use std::path::PathBuf;
-use std::str::FromStr;
-
-#[tokio::test]
-async fn example_1() -> Result<(), Report> {
- // ------------------------------------------------------------------------
- // Dataset Download
-
- let name = Name::SarsCov2;
- let tag = Tag::from_str("2023-11-17")?;
- let output_dir = PathBuf::from("test").join("example_1").join("dataset");
- let summary = None;
-
- let mut args = cli::dataset::download::Args {
- name,
- tag,
- output_dir: output_dir.clone(),
- summary,
- };
-
- download::dataset(&mut args).await?;
-
- // // ------------------------------------------------------------------------
- // // Run
-
- // let population = Some("AY.4.2*,BA.5.2,XBC.1.6*,XBB.1.5.1,XBL".to_string());
- // let dataset_dir = output_dir;
- // let output_dir = PathBuf::from("test").join("example_1").join("run");
-
- // let mut args = cli::run::Args {
- // population,
- // dataset_dir: dataset_dir.clone(),
- // output_dir: output_dir.clone(),
- // ..Default::default()
- // };
-
- // run(&mut args)?;
-
- // ------------------------------------------------------------------------
- // Plot
-
- Ok(())
-}
diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs
new file mode 100644
index 0000000..3290b6b
--- /dev/null
+++ b/tests/integration_tests.rs
@@ -0,0 +1,78 @@
+use rebar::cli;
+use rebar::dataset::attributes::{Name, Tag};
+use rebar::dataset::download;
+use rebar::plot::plot;
+use rebar::run::run;
+
+use color_eyre::eyre::{Report, Result};
+use std::path::PathBuf;
+use std::str::FromStr;
+
+#[tokio::test]
+async fn toy1() -> Result<(), Report> {
+ let output_dir = PathBuf::from("output").join("tests").join("toy1");
+
+ // Dataset Download
+ let mut args = cli::dataset::download::Args {
+ name: Name::Toy1,
+ tag: Tag::from_str("custom")?,
+ output_dir: output_dir.join("dataset"),
+ summary: None,
+ };
+ download::dataset(&mut args).await?;
+
+ // Run
+ let mut args = cli::run::Args {
+ population: Some("*".to_string()),
+ dataset_dir: output_dir.join("dataset"),
+ output_dir: output_dir.join("run"),
+ mask: vec![0, 0],
+ min_length: 3,
+ ..Default::default()
+ };
+ run(&mut args)?;
+
+ // Plot
+ let args = cli::plot::Args {
+ dataset_dir: output_dir.join("dataset"),
+ run_dir: output_dir.join("run"),
+ ..Default::default()
+ };
+ plot(&args)?;
+
+ Ok(())
+}
+
+#[tokio::test]
+async fn sarscov2_populations() -> Result<(), Report> {
+ let output_dir =
+ PathBuf::from("output").join("tests").join("sarscov2").join("populations");
+
+ // Dataset Download
+ let mut args = cli::dataset::download::Args {
+ name: Name::SarsCov2,
+ tag: Tag::from_str("2023-11-17")?,
+ output_dir: output_dir.join("dataset"),
+ summary: None,
+ };
+ download::dataset(&mut args).await?;
+
+ // Run
+ let mut args = cli::run::Args {
+ population: Some("AY.4.2*,BA.5.2,XBC.1.6*,XBB.1.5.1,XBL".to_string()),
+ dataset_dir: output_dir.join("dataset"),
+ output_dir: output_dir.join("run"),
+ ..Default::default()
+ };
+ run(&mut args)?;
+
+ // Plot
+ let args = cli::plot::Args {
+ dataset_dir: output_dir.join("dataset"),
+ run_dir: output_dir.join("run"),
+ ..Default::default()
+ };
+ plot(&args)?;
+
+ Ok(())
+}