Skip to content

Commit

Permalink
Merge pull request #51 from TRON-Bioinformatics/move-download-section
Browse files Browse the repository at this point in the history
Move download section
  • Loading branch information
priesgo authored Oct 24, 2022
2 parents 0a824ff + b747209 commit d700773
Show file tree
Hide file tree
Showing 9 changed files with 123 additions and 49 deletions.
2 changes: 1 addition & 1 deletion covigator/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
VERSION = "v1.0.0"
VERSION = "v1.1.0"
ANALYSIS_PIPELINE_VERSION = "v0.14.0"

MISSENSE_VARIANT = "missense_variant"
Expand Down
52 changes: 34 additions & 18 deletions covigator/dashboard/dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,14 @@
from covigator.database.model import DataSource
from covigator.database.queries import Queries

COVIGATOR_ENA_LOGO = "/assets/CoVigator_logo_ENA.png"
COVIGATOR_COVID19_LOGO = "/assets/CoVigator_logo_txt_reg_no_bg_covid19_portal.png"
COVIGATOR_LOGO = "/assets/CoVigator_logo_txt_reg_no_bg.png"
HOME_HREF = "/"
COVID_PORTAL_HREF = "/covid19-portal"
ENA_HREF = "/ena"
DOWNLOAD_HREF = "/download"
ACKNOWLEDGEMENTS_HREF = "/acknowledgements"
TAB_STYLE = {"color": "#003c78", 'margin-right': '15px'}

ID_TAB_CONTENT = "tab-content"
Expand Down Expand Up @@ -63,8 +71,8 @@ def serve_layout(self):
[
dbc.Row([
dbc.Col(
children=html.A(html.Img(src="/assets/CoVigator_logo_txt_reg_no_bg.png",
height="80px"), href="/"),
children=html.A(html.Img(src=COVIGATOR_LOGO,
height="80px"), href=HOME_HREF),
className="ml-2",
id="logo"
),
Expand Down Expand Up @@ -97,21 +105,24 @@ def serve_layout(self):
dbc.DropdownMenu(
label="Menu", children=[
dbc.DropdownMenuItem(
"Home", href="/", class_name="m-1",
"Home", href=HOME_HREF, class_name="m-1",
style={'font-size' : '150%', "color": "#003c78"}),
dbc.DropdownMenuItem(
"ENA dataset", href="/ena",
"ENA dashboard", href=ENA_HREF,
style={'font-size' : '150%', "color": "#003c78"}),
dbc.DropdownMenuItem(
"Covid19 Data Portal sequences dataset", href="/covid19-portal",
"COVID-19 Data Portal sequences dashboard", href=COVID_PORTAL_HREF,
style={'font-size': '150%', "color": "#003c78"}),
dbc.DropdownMenuItem(
"Documentation", href="https://covigator.readthedocs.io/en/latest",
target="_blank",
style={'font-size': '150%', "color": "#003c78"}),
dbc.DropdownMenuItem(
"Acknowledgements", href="/acknowledgements",
style={'font-size' : '150%', "color": "#003c78"}),
"Data download", href=DOWNLOAD_HREF,
style={'font-size': '150%', "color": "#003c78"}),
dbc.DropdownMenuItem(
"Acknowledgements", href=ACKNOWLEDGEMENTS_HREF,
style={'font-size': '150%', "color": "#003c78"}),
],
align_end=True,
size="lg",
Expand Down Expand Up @@ -207,16 +218,19 @@ def set_callbacks(app, session: Session, content_folder):
ENA_PAGE = DataSource.ENA
COVID19_PORTAL_PAGE = DataSource.COVID19_PORTAL
ACKNOWLEDGEMENTS_PAGE = "acknowledgements"
DOWNLOAD_PAGE = "download"

def _get_page(url):
if url in ["", "/"]:
if url in ["", HOME_HREF]:
return MAIN_PAGE
elif url == "/covid19-portal":
elif url == COVID_PORTAL_HREF:
return COVID19_PORTAL_PAGE
elif url == "/ena":
elif url == ENA_HREF:
return ENA_PAGE
elif url == "/acknowledgements":
elif url == ACKNOWLEDGEMENTS_HREF:
return ACKNOWLEDGEMENTS_PAGE
elif url == DOWNLOAD_HREF:
return DOWNLOAD_PAGE
else:
raise ValueError("This URL does not exist")

Expand Down Expand Up @@ -246,24 +260,26 @@ def switch_page(url):
dbc.Tab(label="Lineages", tab_id=LINEAGES_TAB_ID, label_style=TAB_STYLE),
dbc.Tab(label="Mutation statistics", tab_id=MUTATIONS_TAB_ID, label_style=TAB_STYLE),
dbc.Tab(label="Recurrent mutations", tab_id=RECURRENT_MUTATIONS_TAB_ID, label_style=TAB_STYLE),
dbc.Tab(label="Intrahost mutations", tab_id=INTRAHOST_MUTATIONS_TAB_ID, label_style=TAB_STYLE),
dbc.Tab(label="Download data", tab_id=DOWNLOAD_TAB_ID, label_style=TAB_STYLE)], ENA_DATASET_TAB_ID
dbc.Tab(label="Intrahost mutations", tab_id=INTRAHOST_MUTATIONS_TAB_ID, label_style=TAB_STYLE)], \
ENA_DATASET_TAB_ID
elif page == ACKNOWLEDGEMENTS_PAGE:
# show ena tabs
return [
dbc.Tab(label="Acknowledgements", tab_id=HELP_TAB_ID, label_style={"color": "#003c78", 'display': 'none'})], HELP_TAB_ID
elif page == DOWNLOAD_PAGE:
return [
dbc.Tab(label="Download", tab_id=DOWNLOAD_TAB_ID, label_style={"color": "#003c78", 'display': 'none'})], DOWNLOAD_TAB_ID

@app.callback(
Output('logo', "children"),
[Input("url", "pathname")])
def switch_logo(url):
page = _get_page(url)
if page == MAIN_PAGE or page == ACKNOWLEDGEMENTS_PAGE:
return html.A(html.Img(src="/assets/CoVigator_logo_txt_reg_no_bg.png", height="80px"), href="/")
if page in [MAIN_PAGE, ACKNOWLEDGEMENTS_PAGE, DOWNLOAD_PAGE]:
return html.A(html.Img(src=COVIGATOR_LOGO, height="80px"), href="/")
elif page == COVID19_PORTAL_PAGE:
return html.A(html.Img(src="/assets/CoVigator_logo_txt_reg_no_bg_covid19_portal.png", height="80px"), href="/")
return html.A(html.Img(src=COVIGATOR_COVID19_LOGO, height="80px"), href="/")
elif page == ENA_PAGE:
return html.A(html.Img(src="/assets/CoVigator_logo_ENA.png", height="80px"), href="/")
return html.A(html.Img(src=COVIGATOR_ENA_LOGO, height="80px"), href="/")

@app.callback(
Output('top-right-logo', "children"),
Expand Down
2 changes: 1 addition & 1 deletion covigator/dashboard/tabs/dataset_covid19_portal.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def get_covid19_portal_overview_tab_left_bar(queries: Queries, count_samples):
children=[
html.Br(),
dcc.Markdown("""
The Covid19 Data Portal (https://www.covid19dataportal.org/) provides among other things DNA assemblies,
The COVID-19 Data Portal (https://www.covid19dataportal.org/) provides among other things DNA assemblies,
geographical information and other metadata about SARS-CoV-2 samples.
The processing pipeline runs alignment to the reference genome (bioypthon),
variant calling (custom code), normalization (vt and BCFtools), annotation (SnpEff)
Expand Down
6 changes: 3 additions & 3 deletions covigator/dashboard/tabs/overview.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,11 @@ def get_tab_overview():
html.Br(),
html.P("""
CoVigator loads publicly available SARS-CoV-2 raw reads (ie: FASTQs) from
the European Nucleotide Archive (ENA) and sequences from the Covid19 Data Portal.
the European Nucleotide Archive (ENA) and sequences from the COVID-19 Data Portal.
Some samples are present in both datasets.
ENA enables a high resolution analysis into the SARS-CoV-2 mutations through the raw reads.
Intrahost mutations are of particular interest.
On the other hand, the Covid19 Data Portal sequences have a lower resolution,
On the other hand, the COVID-19 Data Portal sequences have a lower resolution,
but it is a more extensive dataset.
"""),
html.Br(),
Expand Down Expand Up @@ -76,7 +76,7 @@ def get_tab_overview():
dbc.CardBody(
[
dbc.Button(
"Explore data derived from the Covid19 Data Portal sequences", color="warning",
"Explore data derived from the COVID-19 Data Portal sequences", color="warning",
href="/covid19-portal",
style={"margin-left": "20px", "margin-right": "20px",
"font-size": 20}, ),
Expand Down
27 changes: 18 additions & 9 deletions docs/source/01_overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
[![Powered by Dask](https://img.shields.io/badge/powered%20by-Dask-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](https://dask.org/)
[![Powered by Nextflow](https://img.shields.io/badge/powered%20by-Nextflow-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](https://nextflow.io/)


**CoVigator dashboard**: [https://covigator.tron-mainz.de](https://covigator.tron-mainz.de)

Human infections with SARS-CoV-2 have been spreading globally since the beginning of 2020, necessitating preventive or
Expand Down Expand Up @@ -41,14 +42,19 @@ visual analytics.
<b>Figure 2: Most frequent mutations in the spike protein</b>
</p>

CoVigator loads publicly available SARS-CoV-2 DNA sequences from the
[European Nucleotide Archive (ENA)](https://www.ebi.ac.uk/ena) providing raw reads in FASTQ format.
CoVigator loads publicly available SARS-CoV-2 DNA sequences from two databases:

* [European Nucleotide Archive (ENA)](https://www.ebi.ac.uk/ena) providing raw reads in FASTQ format.
* [COVID-19 Data Portal](https://www.covid19dataportal.org/) providing assemblies in FASTA format.

ENA enables a high resolution into the SARS-CoV-2 mutation details through the individual
There is a certain overlap in the samples present in ENA and the COVID-19 Data Portal, as some national initiatives are systematically
reporting both FASTQ reads and FASTA assemblies. FASTQ reads enable a higher resolution into the SARS-CoV-2 mutation details through the individual
reads. This allows us to annotate mutations with a Variant Allele Frequency (VAF) and explore intrahost
mutations. At the moment we only process the Illumina samples from ENA.
This means excluding all of the Oxford Nanopore samples and hence having a partial view of all the
available data.
mutations. On the other hand, while we load all of the sequences from the COVID-19 Data Portal database in CoVigator, we only process the Illumina
samples from ENA. This means excluding all of the Oxford Nanopore samples and hence having a partial view of all the
available data. Each of the datasets is available at a separate address
[https://covigator.tron-mainz.de/covid19-portal](https://covigator.tron-mainz.de/covid19-portal) and
[https://covigator.tron-mainz.de/ena](https://covigator.tron-mainz.de/ena), respectively.

The dashboard is implemented in the visualization framework [Dash](https://dash.plotly.com/).
The computation is distributed through our cluster with a library of similar name [Dask](https://dask.org/).
Expand Down Expand Up @@ -102,12 +108,14 @@ contribution to integrate Pangolin into the CoVigator pipeline.

We gratefully acknowledge all data contributors, i.e. the Authors and their Originating laboratories responsible for
obtaining the specimens, and their Submitting laboratories for generating the genetic sequence and metadata and sharing
via the European Nucleotide Archive (1), on which this research is based.
via the European Nucleotide Archive [1] and the COVID-19 Data Portal [2], on which this research is based.

1) Leinonen, R., Akhtar, R., Birney, E., Bower, L., Cerdeno-Tárraga, A., Cheng, Y., Cleland, I., Faruque, N.,
Goodgame, N., Gibson, R., Hoad, G., Jang, M., Pakseresht, N., Plaister, S., Radhakrishnan, R., Reddy, K.,
Sobhany, S., Hoopen, P. Ten, Vaughan, R., Zalunin V., Cochrane, G. (2011). The European nucleotide archive.
Nucleic Acids Research, 39(SUPPL. 1), D28. [10.1093/nar/gkq967](https://doi.org/10.1093/nar/gkq967)
2) “COVID-19 Data Portal - Accelerating Scientific Research through Data.” Accessed October 24, 2022. https://www.covid19dataportal.org/.



## A note on terminology
Expand All @@ -125,10 +133,11 @@ kept the use of variant in some scientific terms commonly used; these are:
* Multi Nucleotide Variant (MNV): a point mutation where more than one DNA base is substituted by another

There are two terms referring to a given mutation frequency:
* The Variant Allele Frequency (VAF) refers to the ratio of reads supporting a given mutation.
* The Variant Allele Frequency (VAF) refers to the ratio of reads supporting a given mutation.
The VAF can only be calculated on the ENA dataset.
The VAF is used to distinguish clonal and intrahost mutations.
* The mutation frequency on the other hand refers to the frequency in the population of samples.
Importantly intrahost mutations are not taken into account.
This is calculated on both datasets, ENA and COVID-19 Data Portal, but importantly, intrahost mutations are not taken into account.


------------------------
Expand Down
36 changes: 30 additions & 6 deletions docs/source/02_dashboard.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
[![Powered by Dash](https://img.shields.io/badge/powered%20by-Dash-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](https://dash.plotly.com/)
[![License](https://img.shields.io/badge/license-MIT-green)](https://opensource.org/licenses/MIT)


The Covigator dashboard provides the ability to navigate through a series of interactive plots that visualise
SARS-CoV-2 mutations derived from the ENA dataset. Our aim is to enable users to understand trends in the dataset of mutations in its
SARS-CoV-2 mutations derived from the ENA and COVID-19 Data Portal datasets. Our aim is to enable users to understand trends in the dataset of mutations in its
geographical and temporal context.

Here we describe how to use and interpret the plots in the dashboard.
Expand Down Expand Up @@ -49,7 +50,10 @@ A double click filters out all other elements except the clicked one. Another do

## Overview tab

The first tab show a set of statistics that help to assess the quality of the dataset.
The first tab in both dashboards shows a set of statistics that help to assess the quality of the dataset.
These are different between ENA and COVID-19 Data Portal.

### ENA plots

* **Library strategies**. The distribution of samples across different library strategies (ie: WGS, WGA and
targeted-capture).
Expand All @@ -68,6 +72,19 @@ The first tab show a set of statistics that help to assess the quality of the da

![ENA dataset](_static/figures/screenshot_05_ena_dataset.png)

### COVID-19 Data Portal plots

* **Horizontal coverage**. The distribution of the horizontal coverage.
* X-axis: horizontal coverage %
* Y-axis: number of samples in logarithmic scale
* **Ratio of N and ambiguous bases**. The FASTA format supports ambiguous bases in the form of Ns or other ambiguous
categories that refer to more than one nucleotide base. The ambiguous sequence is unusable for our purposes.
* X-axis: ratio of N and ambiguous bases over sequence length
* Y-axis: number of samples in logarithmic scale

![COVID-19 Data Portal dataset](_static/figures/screenshot_06_c19d_dataset.png)


## Samples tab

The aim of this tab is to explore the accumulation of samples in different countries through time.
Expand Down Expand Up @@ -191,7 +208,7 @@ that most frequently co-occur.
replaced by a scatter plot with the mutations themselves and their frequency in the Y-axis.
* X-axis: genomic coordinates in base pairs
* **Co-occurrence matrix**. A heatmap showing the pairwise co-occurrence between mutations. This plot is only available when
a gene or protein domain has been selected.
a gene or protein domain has been selected, and only for the ENA dataset.
* **Co-occurrence clustering**. The list of clusters with all the mutations within each cluster.

![Recurrent mutations tab](_static/figures/screencast_03_recurrent_mutations_tab.gif)
Expand All @@ -200,6 +217,8 @@ that most frequently co-occur.

## Intrahost mutations tab

**NOTE**: only available for the ENA dataset

The aim of this tab is to enable the exploration of intrahost mutations.
In CoVigator we classify as intrahost all mutation observations with a VAF below 0.8.
The same mutation can be observed as intrahost or clonal in different samples.
Expand Down Expand Up @@ -239,6 +258,11 @@ the temporal distribution across countries and finally the top 10 co-occurring c

## Download the raw data

The download tab provides the raw data generated by the CoVigator pipeline, this is the variant calls for every sample.
This is in a custom table with relevant annotations, unfortunately this is not a standard VCF file and the format of
the table may change in the future. The sample metadata is also provided.
The download section provides the raw data generated by the CoVigator pipeline as it is stored in the database.
This includes the variant calls for every sample in each dataset, its metadata and lineage annotations.

Only the latest data release is available for download; we do not maintain a history of data versions.

Unfortunately, the data is not in the standard VCF format, and the format of the table may change in the future.
Also, we do not provide the BAM files for download.
We are happy to attend specific data requests via our GitHub site.
1 change: 1 addition & 0 deletions docs/source/03_pipeline.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
[![Powered by Nextflow](https://img.shields.io/badge/powered%20by-Nextflow-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](https://www.nextflow.io/)
[![License](https://img.shields.io/badge/license-MIT-green)](https://opensource.org/licenses/MIT)


The Covigator pipeline processes SARS-CoV-2 FASTQ or FASTA files into annotated and normalized analysis ready VCF files.
It also classifies samples into lineages using pangolin.
The pipeline is implemented in the Nextflow framework (Di Tommaso, 2017); it is a stand-alone pipeline that can be
Expand Down
Loading

0 comments on commit d700773

Please sign in to comment.