Skip to content

Commit

Permalink
true read counts from shuffled fastq.gz instead
Browse files Browse the repository at this point in the history
  • Loading branch information
idfarbanecha committed Jul 6, 2021
1 parent 6c6d14e commit 917eec6
Showing 1 changed file with 13 additions and 11 deletions.
24 changes: 13 additions & 11 deletions mess/scripts/true-read-counts.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,25 @@
import gzip
from Bio import SeqIO
import pandas as pd
from collections import Counter
# First positional entry of the Snakemake rule's params (presumably the
# sequencing technology label, judging by the name — TODO confirm).
# NOTE(review): `snakemake` is never imported in the visible code; presumably
# it is injected by Snakemake when this file runs through a `script:`
# directive — confirm against the calling rule.
seqtech = snakemake.params[0]


def get_read_counts(fastq_gz, ss):
    """
    Return a table with the read count of each genome.

    Every record description in the gzip-compressed FASTQ is reduced to a
    genome identifier, and the occurrences of each identifier are counted.

    Parameters
    ----------
    fastq_gz : path to a gzip-compressed FASTQ file (opened in text mode).
    ss : sequencing technology label. For 'illumina' the identifier is the
        text after the last '-' and before the following '.'; for any other
        value it is the text before the first '.' of the description.

    Returns
    -------
    pandas.DataFrame with one row per identifier and the columns
    'Assembly' and 'true_read_counts', in order of first appearance.
    """
    is_illumina = ss == 'illumina'
    counts = Counter()
    with gzip.open(fastq_gz, 'rt') as handle:
        # Count while streaming so the full list of read headers is never
        # held in memory.
        for record in SeqIO.parse(handle, 'fastq'):
            header = record.description
            if is_illumina:
                # NOTE(review): presumably the simulator prefixes illumina
                # read names with extra '-'-separated fields; confirm.
                header = header.split('-')[-1]
            counts[header.split('.')[0]] += 1
    rc = pd.DataFrame.from_dict(dict(counts), orient='index',
                                columns=['true_read_counts'])
    # Turn the identifier index into a regular 'Assembly' column.
    rc.index.set_names('Assembly', inplace=True)
    rc.reset_index(inplace=True)
    return rc


Expand Down

0 comments on commit 917eec6

Please sign in to comment.