Skip to content

Commit

Permalink
Merge pull request #12 from NBISweden/develop
Browse files Browse the repository at this point in the history
Version 0.5.10
  • Loading branch information
johnne authored May 12, 2021
2 parents 9b71c40 + 1e64ceb commit cdb5168
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 4 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
python-version: [3.5, 3.6, 3.7, 3.8]
python-version: [3.6, 3.7, 3.8, 3.9]
os: [ubuntu-latest, macos-latest]
needs: download_taxonomy
steps:
Expand Down
40 changes: 40 additions & 0 deletions contigtax/assign.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,33 @@ def propagate_lower(x, taxid, ranks):
right_index=True)


def unique_cols(x):
    """
    Return positional indices of the first occurrence of each column label.

    This is meant to fix the problem of some taxids having multiple entries
    for the same taxonomic rank.

    Example: Alouatta palliata mexicana (Howler monkey), taxid = 182248

        superkingdom phylum order genus species class  family class
        2759         7711   9443  9499  30589   40674  378855 1338369

    The indices returned by this function can be used with 'iloc' in order
    to get:

        superkingdom phylum order genus species class  family
        2759         7711   9443  9499  30589   40674  378855

    :param x: object with an iterable ``columns`` attribute holding hashable
        labels (in practice a pandas DataFrame with one column per rank)
    :return: list of integer column positions, in original column order,
        keeping only the first position at which each label appears
    """
    # A set gives constant-time membership tests; the separate list keeps
    # the positions in their original left-to-right order.
    seen = set()
    col_index = []
    for i, c in enumerate(x.columns):
        if c in seen:
            # Later duplicates of an already-seen label are dropped
            continue
        seen.add(c)
        col_index.append(i)
    return col_index


def get_lca(r, assignranks, reportranks):
"""
Assign lowest common ancestor from a set of taxids.
Expand Down Expand Up @@ -508,6 +535,8 @@ def process_lineages(items):
x.columns = x.loc["rank"]
x.drop("rank", inplace=True)
x.index = [taxid]
# Only select unique columns
x = x.iloc[:, unique_cols(x)]
# Add taxids for lower ranks in the hierarchy
x = propagate_lower(x, taxid, ranks)
# Add names for taxids
Expand Down Expand Up @@ -543,6 +572,15 @@ def make_name_dict(df, ranks):
return name_dict


def exit_no_info(taxdir):
    """
    Terminate the program with an error message about missing taxonomy data.

    The message names the taxonomy directory and suggests the command to
    force a re-download of the taxonomy database.

    :param taxdir: path to the taxonomy directory, inserted into the message
    :return: never returns; sys.exit is called with the message string
    """
    message_lines = [
        "",
        f"ERROR: No taxonomic information found in the database at {taxdir}",
        "This may be caused by an out-of-date or incomplete taxonomy database.",
        "Try running:",
        f"contigtax download taxonomy -f -t {taxdir}",
        "to force an update.",
        "",
    ]
    # Leading and trailing empty entries reproduce the newlines that open
    # and close the message.
    sys.exit("\n".join(message_lines))

def make_lineage_df(taxids, taxdir, dbname, ranks, cpus=1):
"""
Creates a lineage dataframe with full taxonomic information for a list of
Expand Down Expand Up @@ -583,6 +621,8 @@ def make_lineage_df(taxids, taxdir, dbname, ranks, cpus=1):
rename = {y: x for x, y in translate_dict.items()}
# Update lineages with missing taxids
lineages.update(ncbi_taxa.get_lineage_translator(translate_dict.values()))
if len(lineages.keys()) == 0:
exit_no_info(taxdir)
items = [[taxid, ranks, taxdir, dbname, lineages[taxid]] for taxid in
list(lineages.keys())]
with Pool(processes=cpus) as pool:
Expand Down
4 changes: 2 additions & 2 deletions environment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ channels:
- bioconda
- defaults
dependencies:
- python
- python>=3.6
- diamond>=0.8.37, <=0.9.24
- ete3
- biopython
- pandas
- tqdm
- tqdm>4.7.2
- numpy
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="contigtax",
version="0.5.9",
version="0.5.10",
author="John Sundh",
author_email="john.sundh@scilifelab.se",
description="A package to assign taxonomy to metagenomic contigs",
Expand Down
11 changes: 11 additions & 0 deletions tests/data/shreds.fasta
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
>182248|contig1001
CCCCGCAAAACTCACCCACTAGCAAAAATCATCAACAATTCATTCATTGACCTCCCTACAC
CATCCAACATCTCCGCCTGGTGAAATTTCGGCTCACTCCTAGGTATTTGCCTAATTATTCAAATCACTAC
AGGTCTATTCTTAGCCATACACTATACACCAGACACTTCAACCGCCTTCTCCTCAGTCGCCCACATCACC
CGAGACGTCAACTATGGCTGAATAATCCGCTATCTACATGCCAACGGCGCCTCCATATTCTTTATCTGCC
TCTTTCTCCACATTGGCCGAGGCTTATATTACGGATCATTCCTTTTTCTGGAGACCTGGAACGTCGGTAT
TATCCTCCTACTCACAACCATAGCCACAGCATTCATAGGCTATGTCCTCCCATGAGGCCAAATATCATTC
TGAGGGGCCACAGTAATTACAAACCTTCTGTCAGCCATTCCATATATCGGGTCTGACCTCGTACAATGAA
TCTGAGGTGGCTTCTCAGTAGATAAAGCCACCCTCACACGATTTTTCACCTTTCACTTTATCTTGCCATT
TATTATCGCTGCCCTAGCAACCATCCACCTCTTGTTTCTGCATGAAACAGGATCAAGTAACCCATCAGGA
GTAGCATCGGACCTCGACAAAATTACATTTCACCCCTACT
>1097667|contig0
CCCGGGTCGCCCGGCGCGTCGACCTGCAGGACCAGATCGGCGTCGGCGGCCCACGAGCGGCCGGCGACCGCGGCCGGCAGATCGAGGATCCGCAGCCACAGCGCGTCGCCCTGCTGGATCGTCCGCATGGCCCGCGGATCGGCCGGCGCCAGCTCCAGCTCGTCGTCGAGCGGTCGCTCGTGGAGCTCGACCCGGGTCGCCAGGTCGATGCTCGTCAGGAAGGCGAGCAGGGCCCGGCCGGCCTCCGGCGTGGCCGCGATCAGCTCCTGGACCTCGACGGTGGTCTGGCCGATCCCGCCGACCGGACCGCCGGCCACGTCCTCGTGGACGCGGTAGAGCGCGTAGCCGTCCTCGCCGCACGCGACGAGCCGCTTGGGCGCCCCGTCGGCCGGTCGCTCGTCCTGTGCGTCGCTGAGGATCCGGGTCGTCCACCAGGCCGGCGGGCGCGACATGATCCCCGCGCGCTGCCGGCGGGCGCGGTCGAAGATCGCCGCGGTCAGCGGCGCCGCCCCGGCCAGATCGACGAGCCGCAGCGACGGATCGGCGACCGGCGACCGCAGCGGCACCCCGCCCGCCAGGTCGATCCGACTGCGGCGCGTCCAGCTGGCCGGCCCGAAGCCGAACCGCCCGTAGATCCGGCCCTCGGACGCCCACAGCGCCGCCAGCGGCTCCCCCCGCTCGCGCGCGTCGGCGACCAGCGCCTTCATCATCCGCCGCAGGATCCCGCGCCGGCGGTGGGTCGGCAGGACGGTGACGAGCGTGATCGCGGCGCACGGCAGCTCCCCGCCGGGGACCGTCAGCCGCCAGCCCCACGCCGACGAGGTGCCGACGACCTGGCCGTCGTCGAGCGCGATCCGACTGCGGTCGAGCTCGAACCGGTCCCGCAGCCGCAGCCGGGACGCATCGCTGCGATCGCCGTGGAAGGCGATCGCAGCGGCGTCCAGCAGCGCGCCGAACTCGTCCTCGGTGGCCGCACGCACGGCCAGGCGATCGCTCATCGCTCGACCCTATCGCCCGCCGGACGGCTGCCGGCCGCGACGCGAAATCACCGTCCGCACGGGGCCCGCGGCGCCCGCCTTCTACGCTGACGCGATGGCCGAGACCGCCCCGCCCACCCGACCGACGCAGGTGCTCGTCGTCCTGGACGCCTACTGCGGCTGGTGCCACGGGTTCCGCGACGCGTTGCTCGGCTTCTGGGAGCGCCATCGCGAGGACCACGAGTTCGTCGTCCTCGCCGGCGGCCTGGTCACCGGCGATCGGGTCGCGCCGATCGCGAGCTTCGACTTCATCCCCGACGGCAACCGGCGGATCGCCGAGCTGACCGGGGCCCGCTTCGGCGAGCCCTACCTGGAGCTGCTGGCCGACGGCTCGCTGGTGATCGACTCGACCGACGCCGCCCGCGGATTCACCGCGCTGCGCGCCCAGGCCCCCGAGCGCTCGGTCCCGCTGGCGGTCGCGATCGCCGACGCCTTCTTCCACGACGGGCGGTCGCTGTCGGACGTCGCGACCTTCCGCGCCGTCGCCGTCGACCAGGGGCTCGATCCCGACGCCGTCGAGGCGGCCTTCGAGGACCCCGCGACCGCCGCCCAGGTGGCCGAGGAGTTCGCCGCCGTCTCGCGGATCGGCGTCAGCGGCTTCCCGACGGTCGCCGTCAGCCACGGCGACCACCTGCACGCGATCGCCGTCGGCTACGCGACCGGCGAAGAGCTGGAGCAGCGCCTGGCCGCGGCGGCCCAGGCCCACTGATCGCGTCGTCGGTCGGGGCGATCCCCGACCGCGGACGGCGCCGGATCCTCCTCTCGGCGGAGGCCGCCCGCTGCCCGGCGCGTCAGACTGGGGACGATGGCTGACGCGCGCCCGGCTGCGGGTACCACCCTCGCCCGCCTGCTGGTGGCCGCCGGCGTCGCGCTGGCCCTCGCCGACGCGTCGGTGGTGACCCTGGCGCTGCCCCAGATCCTGACCCGGCTCGACACGTCGATCGACGGCGTCGCCGCGGTGAT
CGGCGTCTACACGGTGGTCCTCGCGGCGGCCGTGCTGGTGGCCGTGCCGCTGCGCCGGCGGCTCGGCAGCGCGGCGCTGGGAGCGGCCGGGATGCTGCTGTTCGCGCTGGCCGGCGCCGCCTGCGGGCTGGCGGAGTCGCTGGAGCTGCTGCTGGCCGCCCGCGCCGCCCAGGCGATCGGGGCCGGGGCCGGCCTGGTCGCCGGCTACTCGCTGCTGCACCGGACGCGCCAGATCGGCCAGCGGGCGGACGCGCTCTGGATCGCCGGCGCGGTCTTCGGCACGGCCGTCGGGCCGGCGCTCGGCGGGGCGCTGACGGAGCTCTT
>1097667|contig1
Expand Down
1 change: 1 addition & 0 deletions tests/data/shreds.taxids.tsv
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
182248|contig1001 182248
1097667|contig0 1097667
1097667|contig1 1097667
1097667|contig10 1097667
Expand Down
6 changes: 6 additions & 0 deletions tests/data/uniref100.fasta
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
>UniRef100_Q8HFX4 Complex III subunit 3 (Fragment) OS=Alouatta palliata palliata OX=182250 GN=cytb PE=4 SV=1 TaxID=182248
MTTPRKTHPLAKIINNSFIDLPTPSNISAWWNFGSLLGICLIIQITTGLFLAMHYTPDTS
TAFSSVAHITRDVNYGWMIRYLHANGASMFFICLFLHIGRGLYYGSFLFLETWNVGIILL
LTTMATAFMGYVLPWGQMSFWGATVITNLLSAIPYIGSDLVQWIWGGFSVDKATLTRFFT
FHFILPFIIAALATIHLLFLHETGSSNPSGVASDLDKITFHPYYTTKDILGLIILLLCLM
SLTLFLPDLLTDPDNYTLANPLNTPP
>UniRef100_T0LMY7 Bifunctional NAD(P)H-hydrate repair enzyme n=1 Tax=Thermoplasmatales archaeon I-plasma TaxID=667138 RepID=T0LMY7_9EURY
MHTILDSRILDANAEAAGVNMEELMNNAGNAVAKLVMSLEPKRVLVACGSGNNGGDGYVA
ATTLKREGISVTCYPVSPPATYLSKKKYAAYLKNKGRVVHSMKEGSYDVIIDALLGVGIS
Expand Down

0 comments on commit cdb5168

Please sign in to comment.