From 9166f24a1aed4633a0e5103333c09b7dc84c63f9 Mon Sep 17 00:00:00 2001
From: farchaab <farid.chaabane@chuv.ch>
Date: Wed, 4 Sep 2024 13:09:56 +0200
Subject: [PATCH 1/4] remove spaces in dataframe

---
 mess/workflow/scripts/samples.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mess/workflow/scripts/samples.py b/mess/workflow/scripts/samples.py
index a5e91d8..01203d9 100644
--- a/mess/workflow/scripts/samples.py
+++ b/mess/workflow/scripts/samples.py
@@ -11,6 +11,8 @@
 dfs = []
 for file in files:
     df = pd.read_csv(file, sep="\t")
+    df.columns = df.columns.str.replace(" ", "")
+    df = df.map(lambda x: x.replace(" ", "") if isinstance(x, str) else x)
     dfs.append(df)
     try:
         samples = list(set(df["sample"]))

From 50b1bf12b64cccc10089cca16f54a04221b02276 Mon Sep 17 00:00:00 2001
From: farchaab <farid.chaabane@chuv.ch>
Date: Wed, 4 Sep 2024 13:10:25 +0200
Subject: [PATCH 2/4] fix duplicated fasta paths

---
 mess/workflow/rules/preflight/functions.smk | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/mess/workflow/rules/preflight/functions.smk b/mess/workflow/rules/preflight/functions.smk
index 00ccdcd..08e7bfc 100644
--- a/mess/workflow/rules/preflight/functions.smk
+++ b/mess/workflow/rules/preflight/functions.smk
@@ -69,16 +69,21 @@ fasta_cache = {}
 
 def fasta_input(wildcards):
     table = checkpoints.calculate_genome_coverages.get(**wildcards).output[0]
-    if table not in fasta_cache:
-        df = pd.read_csv(table, sep="\t", index_col="fasta")
-        fasta_cache[table] = df
-    df = fasta_cache[table]
-    return df.loc[wildcards.fasta]["path"]
+
+    df = pd.read_csv(table, sep="\t", index_col="fasta")
+    try:
+        return df.loc[wildcards.fasta]["path"].drop_duplicates()
+    except AttributeError:
+        return df.loc[wildcards.fasta]["path"]
+    # several samples can share one genome path: drop duplicates so each fasta path is returned once (a single match is a plain value without drop_duplicates, hence the AttributeError fallback)
 
 
 def list_fastas(wildcards):
     table = checkpoints.calculate_genome_coverages.get(**wildcards).output[0]
-    df = pd.read_csv(table, sep="\t")
+    if table not in fasta_cache:
+        df = pd.read_csv(table, sep="\t")
+        fasta_cache[table] = df
+    df = fasta_cache[table]
     fastas = list(set(df["fasta"]))
     return expand(os.path.join(dir.out.processing, "{fasta}.fasta"), fasta=fastas)
 

From d3168354f989d70cd213caf80b894d19bd0bdf65 Mon Sep 17 00:00:00 2001
From: farchaab <farid.chaabane@chuv.ch>
Date: Wed, 4 Sep 2024 13:10:37 +0200
Subject: [PATCH 3/4] updated readme

---
 README.md | 60 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 34 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index 4a25b3f..e3481e4 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@
 
 [![](https://img.shields.io/static/v1?label=CLI&message=Snaketool&color=blueviolet)](https://github.com/beardymcjohnface/Snaketool)
 [![license](https://img.shields.io/github/license/metagenlab/mess.svg)](https://github.com/metagenlab/MeSS/blob/main/LICENSE)
+[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/mess/README.html)
 [![version](https://img.shields.io/conda/vn/bioconda/mess?color=blue)](http://bioconda.github.io/recipes/mess/README.html)
 [![downloads](https://img.shields.io/conda/dn/bioconda/mess.svg)](https://anaconda.org/bioconda/mess)
 
@@ -25,7 +26,7 @@ input["samples.tsv
 or 
 samples/*.tsv"] --> taxons
 
-subgraph genome_download["genome download"]
+subgraph genome_download["`**genome download**`"]
 dlchoice{download ?}
 taxons["taxons or
 accesions"] --> dlchoice
@@ -35,7 +36,7 @@ assembly_finder --> fasta
 end
 
 input --> distchoice
-subgraph community_design["community design"]
+subgraph community_design["`**community design**`"]
 distchoice{draw distribution ?}
 distchoice -->|yes| dist["distribution 
 (lognormal, even)"]
@@ -58,9 +59,7 @@ simulator --> bam
 simulator --> fastq
 simulator --> CAMI-profile
 
-%% colors
-style genome_download color:black
-style community_design color:black
+%% subgraph color fills
 classDef red fill:#faeaea,color:#fff,stroke:#333;
 classDef blue fill:#eaecfa,color:#fff,stroke:#333;
 class genome_download blue
@@ -72,46 +71,55 @@ More details can be found in the [documentation](https://metagenlab.github.io/Me
 
 ## :zap: Quick start 
 ### Installation
-
-#### Mamba
-
-[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/mess/README.html)
-
+Mamba
 ```sh
 mamba create -n mess mess
 ```
 
-#### Docker
-
+Docker
 ```sh
 docker pull ghcr.io/metagenlab/mess:latest
 ```
 
-#### From source 
-
+From source 
 ```sh
 git clone https://github.com/metagenlab/MeSS.git
 pip install -e MeSS
 ```
 
 ### Usage
-
-
-#### Download and simulate
-
-Using the following file [minimal_test.tsv](https://github.com/metagenlab/MeSS/blob/main/mess/test_data/minimal_test.tsv)
-
+#### :arrow_right: Input
+Let's simulate two metagenomic samples with the following taxa and read counts in `samples.tsv`:
+| sample   | taxon | reads  |
+| ---      | ---   | ---    |
+| sample1  |  487  | 174840 |
+| sample1  |  727  | 90679  |
+| sample1  |  729  | 13129  |
+| sample2  | 28132 | 147863 |
+| sample2  | 199   | 147545 |
+| sample2  | 729   | 131300 |
+
+#### :rocket: Command
+Let's run MeSS (using apptainer as the software deployment method) !
 ```sh
-mess run -i minimal_test.tsv 
+mess run -i samples.tsv --sdm apptainer 
 ```
+#### :bar_chart: Resources usage
+Average resources usage measured 3 times with one CPU (within a [nextflow](https://github.com/nextflow-io/nextflow) process):
 
-#### Simulate from local fasta
+| task_id | hash      | native_id | name     | status    | exit | submit                  | duration | realtime | %cpu   | peak_rss | peak_vmem | rchar  | wchar  |
+| ------- | --------- | --------- | -------- | --------- | ---- | ----------------------- | -------- | -------- | ------ | -------- | --------- | ------ | ------ |
+| 1       | fe/03c2bc | 62286     | MESS (1) | COMPLETED | 0    | 2024-09-04 12:41:15.820 | 1m 50s   | 1m 50s   | 111.5% | 1.8 GB   | 9 GB      | 3.5 GB | 2.4 GB |
+| 1       | ff/0d03b1 | 73355     | MESS (1) | COMPLETED | 0    | 2024-09-04 12:55:12.903 | 1m 52s   | 1m 52s   | 112.6% | 1.7 GB   | 8.8 GB    | 3.5 GB | 2.4 GB |
+| 1       | 07/d352bf | 83576     | MESS (1) | COMPLETED | 0    | 2024-09-04 12:57:30.600 | 1m 50s   | 1m 50s   | 113.2% | 1.7 GB   | 8.9 GB    | 3.5 GB | 2.4 GB |
 
-Download the [fasta directory](https://github.com/metagenlab/MeSS/tree/main/mess/test_data/fastas) and [table](https://github.com/metagenlab/MeSS/blob/main/mess/test_data/simulate_test.tsv)
+> On average, using `samples.tsv`, MeSS runs in under 2 minutes while using around 1.8 GB of physical RAM.
+
+> [!NOTE]
+> Resources usage was measured exluding dependencies deployement time (conda env creation or container pulling)
+
+More details on resource usage in the [doc](https://metagenlab.github.io/MeSS/benchmarks/resource-usage/)
 
-```sh
-mess simulate -i simulate_test.tsv --fasta fasta 
-```
 
 ## :sos: Help
 

From 2822070d667441ce108ca1c118fe1c876df84f0e Mon Sep 17 00:00:00 2001
From: farchaab <farid.chaabane@chuv.ch>
Date: Wed, 4 Sep 2024 13:22:12 +0200
Subject: [PATCH 4/4] updated readme

---
 README.md | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index e3481e4..0baf96f 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@
 
 The Metagenomic Sequence Simulator (MeSS) is a [Snakemake](https://github.com/snakemake/snakemake) pipeline, implemented using [Snaketool](https://github.com/beardymcjohnface/Snaketool), for simulating illumina, Oxford Nanopore (ONT) and Pacific Bioscience (PacBio) shotgun metagenomic samples.
 
-## :memo: Overview
+## :mag: Overview
 
 MeSS takes as input NCBI taxa or local genome assemblies to generate either long (PacBio or ONT) or short (illumina) reads. In addition to reads, MeSS optionally generates bam alignment files and taxonomic + sequence abundances in [CAMI format](https://github.com/bioboxes/rfc/blob/master/data-format/profiling.mkd).
 
@@ -70,7 +70,7 @@ class community_design red
 More details can be found in the [documentation](https://metagenlab.github.io/MeSS/)
 
 ## :zap: Quick start 
-### Installation
+### :gear: Installation
 Mamba
 ```sh
 mamba create -n mess mess
@@ -87,7 +87,7 @@ git clone https://github.com/metagenlab/MeSS.git
 pip install -e MeSS
 ```
 
-### Usage
+### :page_facing_up: Usage
 #### :arrow_right: Input
 Let's simulate two metagenomic samples with the following taxa and read counts in `samples.tsv`:
 | sample   | taxon | reads  |
@@ -104,6 +104,35 @@ Let's run MeSS (using apptainer as the software deployment method) !
 ```sh
 mess run -i samples.tsv --sdm apptainer 
 ```
+#### :card_index_dividers: Outputs
+
+```sh
+📦mess_out
+ ┣ 📂assembly_finder
+ ┃ ┣ 📂download
+ ┃ ┃ ┣ 📂GCF_000144405.1
+ ┃ ┃ ┃ ┗ 📜GCF_000144405.1_ASM14440v1_genomic.fna.gz
+ ┃ ┃ ┣ 📂GCF_001298465.1
+ ┃ ┃ ┃ ┗ 📜GCF_001298465.1_ASM129846v1_genomic.fna.gz
+ ┃ ┃ ┣ 📂GCF_016127215.1
+ ┃ ┃ ┃ ┗ 📜GCF_016127215.1_ASM1612721v1_genomic.fna.gz
+ ┃ ┃ ┣ 📂GCF_020736045.1
+ ┃ ┃ ┃ ┗ 📜GCF_020736045.1_ASM2073604v1_genomic.fna.gz
+ ┃ ┃ ┣ 📂GCF_022869645.1
+ ┃ ┃ ┃ ┗ 📜GCF_022869645.1_ASM2286964v1_genomic.fna.gz
+ ┃ ┃ ┗ 📜.snakemake_timestamp
+ ┣ 📂fastq
+ ┃ ┣ 📜sample1_R1.fq.gz
+ ┃ ┣ 📜sample1_R2.fq.gz
+ ┃ ┣ 📜sample2_R1.fq.gz
+ ┃ ┗ 📜sample2_R2.fq.gz
+ ┣ 📜config.yaml
+ ┣ 📜coverages.tsv
+ ┗ 📜mess.log
+```
+
+Outputs are described in more detail [here](https://metagenlab.github.io/MeSS/guide/output/)
+
 #### :bar_chart: Resources usage
 Average resources usage measured 3 times with one CPU (within a [nextflow](https://github.com/nextflow-io/nextflow) process):
 
@@ -118,7 +147,7 @@ Average resources usage measured 3 times with one CPU (within a [nextflow](https
 > [!NOTE]
 > Resources usage was measured exluding dependencies deployement time (conda env creation or container pulling)
 
-More details on resource usage in the [doc](https://metagenlab.github.io/MeSS/benchmarks/resource-usage/)
+More details on resource usage in the [documentation](https://metagenlab.github.io/MeSS/benchmarks/resource-usage/)
 
 
 ## :sos: Help