-
Notifications
You must be signed in to change notification settings - Fork 81
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adding Kraken2 metagenomics classifier #355
Changes from 15 commits
a873d76
ba12181
1d34fd4
fdc7f0c
017290e
b91fe69
ec6b341
4186394
9c656e5
4bd6715
9e81c98
5223d8f
f3c49a8
287f214
7ea2790
ba9ed15
3e15c93
ace02e1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,78 @@ | ||||||
#!/usr/bin/env python | ||||||
|
||||||
|
||||||
import argparse | ||||||
import csv | ||||||
|
||||||
|
||||||
def _get_args(): | ||||||
'''This function parses and return arguments passed in''' | ||||||
parser = argparse.ArgumentParser( | ||||||
prog='kraken_parse', | ||||||
formatter_class=argparse.RawDescriptionHelpFormatter, | ||||||
description='Parsing kraken') | ||||||
parser.add_argument('krakenReport', help="path to kraken report file") | ||||||
parser.add_argument( | ||||||
'-c', | ||||||
dest="count", | ||||||
default=50, | ||||||
help="Minimum number of hits on clade to report it. Default = 50") | ||||||
parser.add_argument( | ||||||
'-o', | ||||||
dest="output", | ||||||
default=None, | ||||||
help="Output file. Default = <basename>.kraken_parsed.csv") | ||||||
|
||||||
args = parser.parse_args() | ||||||
|
||||||
infile = args.krakenReport | ||||||
countlim = int(args.count) | ||||||
outfile = args.output | ||||||
|
||||||
return(infile, countlim, outfile) | ||||||
|
||||||
|
||||||
def _get_basename(file_name): | ||||||
if ("/") in file_name: | ||||||
basename = file_name.split("/")[-1].split(".")[0] | ||||||
else: | ||||||
basename = file_name.split(".")[0] | ||||||
return(basename) | ||||||
|
||||||
|
||||||
def parse_kraken(infile, countlim):
    '''
    Extract per-clade read counts from a kraken report.

    INPUT:
        infile (str): path to kraken report file (tab separated)
        countlim (int): lower count threshold to report hit
    OUTPUT:
        resdict (dict): key=taxid (str), value=readCount (int)
    '''
    resdict = {}
    with open(infile, 'r') as f:
        csvreader = csv.reader(f, delimiter='\t')
        for line in csvreader:
            # Skip blank or truncated rows instead of raising IndexError;
            # a well-formed kraken report row has at least 6 fields.
            if len(line) < 5:
                continue
            reads = int(line[1])  # column 2: reads covered by this clade
            if reads >= countlim:
                taxid = line[4]  # column 5: NCBI taxonomy ID
                resdict[taxid] = reads
    return resdict
|
||||||
|
||||||
def write_output(resdict, infile, outfile):
    """Write taxid/count pairs as a two-column CSV headed by the sample name.

    INPUT:
        resdict (dict): key=taxid, value=readCount
        infile (str): original kraken report path (used for the column header)
        outfile (str): destination CSV path
    """
    with open(outfile, 'w') as f:
        basename = _get_basename(infile)
        f.write(f"TAXID,{basename}\n")
        # iterate items() directly instead of keys() + per-key lookup
        for taxid, reads in resdict.items():
            f.write(f"{taxid},{reads}\n")
|
||||||
|
||||||
if __name__ == '__main__':
    INFILE, COUNTLIM, OUTFILE = _get_args()

    # Fall back to <basename>.kraken_parsed.csv when no -o was given
    # (uppercase names kept consistent with the other module-level constants).
    if not OUTFILE:
        OUTFILE = _get_basename(INFILE) + ".kraken_parsed.csv"

    TMP_DICT = parse_kraken(infile=INFILE, countlim=COUNTLIM)
    write_output(resdict=TMP_DICT, infile=INFILE, outfile=OUTFILE)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
#!/usr/bin/env python | ||
|
||
import argparse | ||
import os | ||
import pandas as pd | ||
import numpy as np | ||
|
||
|
||
def _get_args(): | ||
'''This function parses and return arguments passed in''' | ||
parser = argparse.ArgumentParser( | ||
prog='merge_kraken_res', | ||
formatter_class=argparse.RawDescriptionHelpFormatter, | ||
description='Merging csv count files in one table') | ||
parser.add_argument( | ||
'-o', | ||
dest="output", | ||
default=None, | ||
help="Output file. Default = sources.csv") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What does 'sources' mean here? Or is this a leftover from CoproID? Maybe should be changed to same on line 58? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. From kraken-nf actually ;) |
||
|
||
args = parser.parse_args() | ||
|
||
outfile = args.output | ||
|
||
return(outfile) | ||
|
||
|
||
def get_csv():
    """Return the names of all *.csv files in the current working directory.

    Uses endswith() so names like 'data.csv.bak' are not picked up,
    which a substring test (".csv" in name) would wrongly match.
    """
    return [name for name in os.listdir() if name.endswith(".csv")]
|
||
|
||
def _get_basename(file_name): | ||
if ("/") in file_name: | ||
basename = file_name.split("/")[-1].split(".")[0] | ||
else: | ||
basename = file_name.split(".")[0] | ||
return(basename) | ||
|
||
|
||
def merge_csv(all_csv):
    """Outer-merge a list of single-sample count CSVs on TAXID.

    Taxa absent from a given sample end up as 0 in that sample's column.
    """
    merged = pd.read_csv(all_csv[0], index_col=0)
    for csv_path in all_csv[1:]:
        sample_df = pd.read_csv(csv_path, index_col=0)
        merged = pd.merge(left=merged, right=sample_df, on='TAXID', how='outer')
    merged.fillna(0, inplace=True)
    return merged
|
||
|
||
def write_csv(pd_dataframe, outfile):
    """Serialise the merged count table to CSV at *outfile*."""
    pd_dataframe.to_csv(path_or_buf=outfile)
|
||
|
||
if __name__ == "__main__": | ||
OUTFILE = _get_args() | ||
all_csv = get_csv() | ||
resdf = merge_csv(all_csv) | ||
write_csv(resdf, "kraken_otu_table.csv") | ||
print(resdf) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
/*
 * -------------------------------------------------
 * Nextflow config file for running tests
 * -------------------------------------------------
 * Defines bundled input files and everything required
 * to run a fast and simple test. Use as follows:
 *   nextflow run nf-core/eager -profile test,docker (or singularity, or conda)
 */

params {
  config_profile_name = 'Test profile kraken'
  config_profile_description = 'Minimal test dataset to check pipeline function with kraken metagenomic profiler'
  // Limit resources so that this can run on Travis
  max_cpus = 2
  max_memory = 6.GB
  max_time = 48.h
  genome = false
  // Input data: two paired-end mammoth test libraries hosted on nf-core/test-datasets
  single_end = false
  metagenomic_tool = 'kraken'
  run_metagenomic_screening = true
  readPaths = [['JK2782_TGGCCGATCAACGA_L008', ['https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2782_TGGCCGATCAACGA_L008_R1_001.fastq.gz.tengrand.fq.gz','https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2782_TGGCCGATCAACGA_L008_R2_001.fastq.gz.tengrand.fq.gz']],
    ['JK2802_AGAATAACCTACCA_L008', ['https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2802_AGAATAACCTACCA_L008_R1_001.fastq.gz.tengrand.fq.gz','https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2802_AGAATAACCTACCA_L008_R2_001.fastq.gz.tengrand.fq.gz']],
  ]
  // Genome references
  fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Mammoth/Mammoth_MT_Krause.fasta'
  // Pre-built kraken database archive used by the metagenomic screening step
  database = 'https://github.com/nf-core/test-datasets/raw/eager/databases/kraken/eager_test.tar.gz'
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changed