
Commit

added submodule in main repo
idfarbanecha committed Jul 12, 2021
1 parent 12a7f02 commit 7248898
Showing 12 changed files with 479 additions and 4 deletions.
3 changes: 0 additions & 3 deletions .gitmodules

This file was deleted.

1 change: 0 additions & 1 deletion mess/assembly_finder
Submodule assembly_finder deleted from 52c178
13 changes: 13 additions & 0 deletions mess/assembly_finder/Snakefile
@@ -0,0 +1,13 @@
import os
include: 'rules/find_assemblies.rules'
community_name = config['community_name']

def downloaded_list(wildcards):
    checkpoint_output = checkpoints.download_assemblies.get(**wildcards).output[0]
    directory = '/'.join(checkpoint_output.split('/')[0:2])
    return expand(f'assembly_gz/{community_name}/{{i}}_genomic.fna.gz',
                  i=glob_wildcards(os.path.join(directory, '{i}_genomic.fna.gz')).i)

rule all_download:
    input: f"{community_name}-assemblies-summary.tsv",
           downloaded_list,
           f"assembly_gz/{community_name}/{community_name}.done"
9 changes: 9 additions & 0 deletions mess/assembly_finder/envs/Assembly_finder.yml
@@ -0,0 +1,9 @@
name: Assembly_finder
channels:
- bioconda
- conda-forge

dependencies:
- biopython = 1.78
- pandas = 1.2.2
- ete3 = 3.1.2
5 changes: 5 additions & 0 deletions mess/assembly_finder/envs/download.yml
@@ -0,0 +1,5 @@
name: download
channels:
- hcc
dependencies:
- aspera-cli = 3.9.1
27 changes: 27 additions & 0 deletions mess/assembly_finder/envs/singularity/aspera.Dockerfile
@@ -0,0 +1,27 @@
FROM continuumio/miniconda3:4.7.12


################## METADATA ######################

LABEL base.image="miniconda3:4.7.12"
LABEL version="v.1.0"
LABEL software="aspera"
LABEL software.version="3.9.1"
LABEL description="IBM Aspera CLI (https://downloads.asperasoft.com/en/documentation/62), but installed from a Conda environment"
LABEL tags="Genomics"


################## INSTALLATION ######################
ENV DEBIAN_FRONTEND noninteractive

COPY ./envs/download.yml ./download.yml

RUN conda update conda && \
conda env create -f download.yml && \
conda clean --all --yes


RUN conda init bash
ENTRYPOINT ["/bin/bash"]
ENV PATH /opt/conda/envs/download/bin:$PATH
ENV CONDA_PREFIX "/opt/conda/envs/download"
30 changes: 30 additions & 0 deletions mess/assembly_finder/envs/singularity/assembly_finder.Dockerfile
@@ -0,0 +1,30 @@
FROM continuumio/miniconda3:4.7.12


################## METADATA ######################

LABEL base.image="miniconda3:4.7.12"
LABEL version="v.1.1"
LABEL software="assembly_finder"
LABEL tags="Genomics"

################## MAINTAINER ######################

MAINTAINER Valentin Scherz

################## INSTALLATION ######################
ENV DEBIAN_FRONTEND noninteractive

COPY ./envs/Assembly_finder.yml ./Assembly_finder.yml

RUN conda config --add channels defaults && \
conda config --add channels bioconda && \
conda config --add channels conda-forge && \
conda update conda && \
conda env create -f Assembly_finder.yml && \
conda clean --all --yes


RUN conda init bash
ENTRYPOINT ["/bin/bash"]
ENV PATH /opt/conda/envs/Assembly_finder/bin:$PATH
248 changes: 248 additions & 0 deletions mess/assembly_finder/rules/assembly_table.py
@@ -0,0 +1,248 @@
from Bio import Entrez
import pandas as pd
import warnings
import numpy as np
import logging
from ete3 import NCBITaxa

ncbi = NCBITaxa()


class AssemblyFinder:
    def __init__(self, name, isassembly=False, genbank=False, refseq=True, representative=True, reference=True,
                 complete=True, exclude_metagenomes=True, nb=1, rank_to_select='None', outf='f.tsv', outnf='nf.tsv'):
        self.name = name
        self.assembly = isassembly
        self.genbank = genbank
        self.refseq = refseq
        self.representative = representative
        self.reference = reference
        self.complete = complete
        self.exclude_metagenomes = exclude_metagenomes
        self.target_ranks = ['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']
        self.nchunks = 10000
        self.rank_to_select = rank_to_select
        self.nb = nb
        self.outf = outf
        self.outnf = outnf
        logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', datefmt='%d %b %Y %H:%M:%S',
                            filename=snakemake.log[0], level=logging.DEBUG)

    # Helper methods applied to the assembly summary table
    def get_stat(self, meta, stat):
        """
        Extract an assembly Meta statistic (e.g. contig count, assembly length)
        """
        return meta.split(f' <Stat category="{stat}" sequence_tag="all">')[1].split('</Stat>')[0]

    def get_names(self, gbftp):
        """
        Extract the assembly file name from its GenBank FTP path
        """
        return gbftp.split('/')[-1]

    def get_lin_tax(self, lineages):
        """
        Get a {rank: name} mapping for a list of lineage taxids
        """
        ranks = ncbi.get_rank(lineages).values()
        ranknames = ncbi.get_taxid_translator(lineages).values()
        return dict(zip(ranks, ranknames))

    def replace_nans(self, tb):
        """
        Replace unknown taxonomic ranks with placeholder names derived from the parent rank
        """
        tb = tb.replace(np.nan, 'unknown')
        for i in range(len(tb)):
            for n, col in enumerate(tb.columns):
                if tb.iloc[i, n] == 'unknown' and col != 'superkingdom':
                    tmpname = tb.iloc[i, n - 1] + '_' + col[0]
                    if col == 'species':
                        # use the two-letter 'sp' suffix so species placeholders differ
                        # from the generic one-letter suffix
                        tmpname = tb.iloc[i, n - 1] + '_' + col[0:2]
                    tb.iloc[i, n] = tmpname
        return tb

    def chunks(self, ls, n):
        """
        Split the assembly list into chunks of size n
        """
        return [ls[i:i + n] for i in range(0, len(ls), n)]

    def taxid_find(self):
        """
        Parse the input name and convert scientific names to taxids.
        Returns the taxid found.
        """
        logging.info(f'> Searching for taxIDs {self.name} ...')
        try:
            int(self.name)
            logging.info('Query is a taxID')
            taxid = self.name

        except ValueError:
            logging.warning('Query is not a taxID, enter a taxID to be more precise')
            logging.info(f'Search term: {self.name}[all Names]')
            taxid_list = Entrez.read(Entrez.esearch(db='taxonomy', term=f'{self.name}[all Names]',
                                                    retmax=100))['IdList']
            if len(taxid_list) == 1:
                taxid = taxid_list[0]
                logging.info(f'TaxID:{taxid} found')
            if len(taxid_list) > 1:
                taxid = taxid_list[0]
                logging.warning(f'{len(taxid_list)} TaxIDs found, change the query (taking the first one: {taxid})')
            if len(taxid_list) == 0:
                raise Exception('TaxID not found! Change the search term!')
        return taxid

    def search_assemblies(self):
        if self.assembly:  # If the input is an assembly name or GbUid, use it as the search term
            search_term = f'{self.name}'
        else:  # If not, search the NCBI taxonomy for the taxid
            taxid = self.taxid_find()
            search_term = f'txid{taxid}[Organism:exp] '
            if self.refseq and not self.genbank:
                search_term += 'AND ("latest refseq"[filter] '
            if self.genbank and not self.refseq:
                search_term += 'AND ("latest genbank"[filter] '
            if self.genbank and self.refseq:
                search_term += 'AND (latest[filter] '
            if self.complete and not self.representative and not self.reference:
                search_term += 'AND "complete genome"[filter] '
            if self.complete and self.representative and not self.reference:
                search_term += 'AND "complete genome"[filter] OR "representative genome"[filter] '
            if self.complete and self.representative and self.reference:
                search_term += 'AND "complete genome"[filter] OR "representative genome"[filter] OR ' \
                               '"reference genome"[filter] '
            if self.representative and not self.reference:
                search_term += 'AND "representative genome"[filter] '
            if self.reference and not self.representative:
                search_term += 'AND "reference genome"[filter] '
            if self.representative and self.reference:
                search_term += 'AND "representative genome"[filter] OR "reference genome"[filter] '
            if self.exclude_metagenomes:
                search_term += 'AND all[filter] NOT metagenome[filter])'
        assembly_ids = Entrez.read(Entrez.esearch(db='assembly', term=search_term, retmax=500000))['IdList']
        logging.info(f'> Search term: {search_term}')
        logging.info(f'found {len(assembly_ids)} assemblies')
        if not assembly_ids:
            raise Exception('No assemblies found! Change the search term!')
        return assembly_ids

    def generate_assembly_table(self, assemblies):
        assembly_list = ','.join(assemblies)
        assembly_summary = Entrez.read(Entrez.esummary(db='assembly', id=assembly_list), validate=False)
        summaries = assembly_summary['DocumentSummarySet']['DocumentSummary']
        tb = pd.DataFrame.from_records(summaries)
        columns = ['GbUid', 'RefSeq_category', 'AssemblyStatus', 'FtpPath_GenBank', 'FtpPath_RefSeq', 'Meta',
                   'AsmReleaseDate_GenBank', 'ContigN50', 'ScaffoldN50', 'Coverage', 'Taxid']
        subset = tb[columns]
        lens = subset.apply(lambda x: self.get_stat(x['Meta'], stat='total_length'), axis=1)
        contigs = subset.apply(lambda x: self.get_stat(x['Meta'], stat='contig_count'), axis=1)
        subset.insert(loc=subset.shape[1] - 1, value=lens, column='Assembly_length')
        subset.insert(loc=subset.shape[1] - 1, value=contigs, column='Contig_count')
        subset.insert(loc=1, value=subset['FtpPath_GenBank'].apply(self.get_names), column='AssemblyNames')
        subset = subset.rename(columns={'Coverage': 'Assembly_coverage'})
        subset = subset.drop('Meta', axis=1)
        return subset

    def add_lineage(self, assembly_tb):
        unique_taxids = list(set(assembly_tb['Taxid']))
        taxid2lineage = ncbi.get_lineage_translator(unique_taxids)
        tax = {taxid: self.get_lin_tax(lineage) for taxid, lineage in taxid2lineage.items()}
        lineage_tb = pd.DataFrame.from_dict(tax, orient='index')
        lineage_tb.index.set_names('Taxid', inplace=True)
        lineage_tb.reset_index(inplace=True)
        ordered_ranks = self.target_ranks[::-1]
        ordered_ranks.append('Taxid')
        lin_cols = list(lineage_tb.columns)
        all_cols = list(set().union(lin_cols, ordered_ranks))
        lineage_tb = lineage_tb.reindex(columns=all_cols, fill_value=np.nan)
        lineage_tb = lineage_tb[ordered_ranks]
        lineage_tb = self.replace_nans(lineage_tb)
        lineage_tb = lineage_tb.astype({'Taxid': 'string'})
        merged_table = assembly_tb.merge(lineage_tb, on='Taxid')
        return merged_table

    def select_assemblies(self, table):
        fact_table = table.replace({'RefSeq_category': {'reference genome': 0, 'representative genome': 1, 'na': 6},
                                    'AssemblyStatus': {'Complete Genome': 2, 'Chromosome': 3, 'Scaffold': 4,
                                                       'Contig': 5, 'na': 6}})
        sorted_table = fact_table.sort_values(['RefSeq_category', 'AssemblyStatus', 'Contig_count',
                                               'ScaffoldN50', 'ContigN50', 'AsmReleaseDate_GenBank'],
                                              ascending=[True, True, True, False, False, False])
        if self.rank_to_select != 'None':
            logging.info(f'Filtering according to {self.rank_to_select}, RefSeq categories, assembly status, '
                         f'contig count and release date')
            select_index = []
            unique_list = list(set(sorted_table[self.rank_to_select]))
            if len(unique_list) > 1:
                # randomly select one assembly ID for each unique selected rank (species for example)
                for i in unique_list:
                    select_index.append(sorted_table[sorted_table[self.rank_to_select] == i].sample(1).index[0])
                sorted_table = sorted_table.loc[select_index, :]
            if len(unique_list) == 1:
                logging.info(f'Same {self.rank_to_select} for all assemblies, filtering according to RefSeq '
                             f'categories, assembly status, contig count and release date')
            if len(unique_list) == 0:
                logging.error(f'{self.rank_to_select} is not a target rank')
        else:
            logging.info('No taxonomic rank specified, sorting according to RefSeq category, '
                         'assembly status, contig count and release date')
        if len(sorted_table) >= self.nb:
            logging.info(f'Selecting {self.nb} sorted assemblies out of {len(sorted_table)}')
            sorted_table = sorted_table[0:self.nb]
        if len(sorted_table) < self.nb:
            logging.warning(f'Found fewer than {self.nb} assemblies in total, returning {len(sorted_table)} instead')
        sorted_table = sorted_table.replace({'RefSeq_category': {0: 'reference genome', 1: 'representative genome',
                                                                 6: 'na'},
                                             'AssemblyStatus': {2: 'Complete Genome', 3: 'Chromosome', 4: 'Scaffold',
                                                                5: 'Contig', 6: 'na'}})
        return sorted_table

    def run(self):
        assemblies_found = self.search_assemblies()
        if len(assemblies_found) > self.nchunks:
            warnings.warn(f'{len(assemblies_found)} assemblies found, restrict the search term to find fewer assemblies')
            # Divide the assembly list into chunks of 10000
            assemblies_chunks = self.chunks(assemblies_found, self.nchunks)
            logging.info(f'Parsing assemblies by chunks of {self.nchunks}')
            table_chunks = []
            for n, chunk in enumerate(assemblies_chunks):
                logging.info(f'chunk n°{n}')
                assembly_tb = self.generate_assembly_table(chunk)
                tb = self.add_lineage(assembly_tb)
                table_chunks.append(tb)
            non_filtered_tb = pd.concat(table_chunks, sort=False)
        else:
            assembly_tb = self.generate_assembly_table(assemblies_found)
            non_filtered_tb = self.add_lineage(assembly_tb)

        non_filtered_tb.to_csv(self.outnf, sep='\t', index=None)
        filtered_tb = self.select_assemblies(non_filtered_tb)
        filtered_tb.to_csv(self.outf, sep='\t', index=None)
        return filtered_tb


'''
Main
'''
Entrez.email = snakemake.params['ncbi_email']
Entrez.api_key = snakemake.params['ncbi_key']
comp = snakemake.params['comp']
ref = snakemake.params['ref']
rep = snakemake.params['rep']
met = snakemake.params['met']
gb = snakemake.params['gb']
rs = snakemake.params['rs']
entry = snakemake.wildcards.entry
assembly = snakemake.params['assembly']
column = snakemake.params['column']
rank = snakemake.params['rank_filter']
intb = pd.read_csv(snakemake.input[0], sep='\t', dtype={f'{column}': 'str'})
intb.set_index(f'{column}', inplace=True)
nb = int(intb.loc[entry]['nb_genomes'])
find_assemblies = AssemblyFinder(name=entry, isassembly=assembly, genbank=gb, refseq=rs, representative=rep,
reference=ref, complete=comp, exclude_metagenomes=met, nb=nb, rank_to_select=rank,
outnf=snakemake.output.all, outf=snakemake.output.filtered)
find_assemblies.run()
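
Note: this script is driven by Snakemake's script: directive (it reads snakemake.params, snakemake.wildcards and snakemake.output), so it is not meant to be run standalone. In search_assemblies(), the filter flags are concatenated into a single Entrez esearch term; with the default constructor flags (refseq=True, genbank=False, complete=representative=reference=True, exclude_metagenomes=True), both the complete/representative/reference branch and the representative/reference branch fire. A sketch of the resulting term, using taxid 562 as an arbitrary example:

taxid = 562
term = (
    f'txid{taxid}[Organism:exp] '
    'AND ("latest refseq"[filter] '
    'AND "complete genome"[filter] OR "representative genome"[filter] OR "reference genome"[filter] '
    'AND "representative genome"[filter] OR "reference genome"[filter] '
    'AND all[filter] NOT metagenome[filter])'
)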
13 changes: 13 additions & 0 deletions mess/assembly_finder/rules/combine_tables.py
@@ -0,0 +1,13 @@
import pandas as pd
"""
Main
"""
column = snakemake.params['column']
df_list = []
for file in snakemake.input:
    entry = file.split('/')[1].split('-filtered.tsv')[0]
    tb = pd.read_csv(file, sep='\t')
    tb.insert(loc=0, column=f'{column}', value=[entry] * len(tb))
    df_list.append(tb)
df = pd.concat(df_list, sort=False)
df.to_csv(snakemake.output[0], sep='\t', index=None)
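
Note: combine_tables.py assumes each input path has the entry name as its second path component and a '-filtered.tsv' suffix. A quick illustration of the parsing step, using a hypothetical path:

file = 'tables/562-filtered.tsv'
entry = file.split('/')[1].split('-filtered.tsv')[0]
# entry == '562'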
15 changes: 15 additions & 0 deletions mess/assembly_finder/rules/concat-ftp-links.py
@@ -0,0 +1,15 @@
import pandas as pd
"""
Main
"""
ftplinks = pd.read_csv(snakemake.input[0], sep='\t')['FtpPath_GenBank']
links = []
for link in ftplinks:
    link = link.replace('ftp://ftp.ncbi.nlm.nih.gov', '')
    fna = '/' + link.split('/')[-1] + '_genomic.fna.gz\n'
    link += fna
    links.append(link)

with open(snakemake.output[0], 'w') as f:
    f.writelines(links)
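
Note: this script strips the FTP host from each FtpPath_GenBank value and appends the expected *_genomic.fna.gz file name, producing server-relative paths (presumably consumed by the aspera-cli download step, which is not part of this diff). Illustration with a hypothetical FtpPath_GenBank value:

link = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/005/845/GCA_000005845.2_ASM584v2'
link = link.replace('ftp://ftp.ncbi.nlm.nih.gov', '')
link += '/' + link.split('/')[-1] + '_genomic.fna.gz\n'
# link == '/genomes/all/GCA/000/005/845/GCA_000005845.2_ASM584v2/GCA_000005845.2_ASM584v2_genomic.fna.gz\n'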

0 comments on commit 7248898
