Skip to content

Commit

Permalink
This should provide a fix for #159 (also implements the test as per #137)
Browse files Browse the repository at this point in the history
  • Loading branch information
lucventurini committed Mar 21, 2019
1 parent 0cd5da7 commit 0c2cc18
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 42 deletions.
51 changes: 16 additions & 35 deletions Mikado/subprograms/util/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,52 +540,33 @@ def __store_gene(self, gene):
gene.transcripts[tid].selected_cds_length] = self.__stores["cds_lengths_coding"].get(
gene.transcripts[tid].selected_cds_length, 0) + 1

self.__stores["five_utr_lengths"].set(gene.transcripts[tid].five_utr_length,
self.__stores["five_utr_lengths"].get(
gene.transcripts[tid].five_utr_length, 0) + 1)

self.__stores["three_utr_lengths"].set(gene.transcripts[tid].three_utr_length,
self.__stores["three_utr_lengths"].get(
gene.transcripts[tid].three_utr_length, 0) + 1)

self.__stores["five_utr_nums"].set(gene.transcripts[tid].five_utr_num,
self.__stores["five_utr_nums"].get(
gene.transcripts[tid].five_utr_num, 0) + 1)

self.__stores["three_utr_nums"].set(gene.transcripts[tid].three_utr_num,
self.__stores["three_utr_nums"].get(
gene.transcripts[tid].three_utr_num, 0) + 1)
self.__stores["end_distance_from_junction"].set(
gene.transcripts[tid].selected_end_distance_from_junction,
self.__stores["end_distance_from_junction"].get(
gene.transcripts[tid].selected_end_distance_from_junction, 0) + 1
)
for key, attribute in (("five_utr_lengths", "five_utr_length"),
("three_utr_lengths", "three_utr_length"),
("five_utr_nums", "five_utr_num"),
("three_utr_nums", "three_utr_num"),
("end_distance_from_junction", "end_distance_from_junction")):
val = getattr(gene.transcripts[tid], attribute)
self.__stores[key][val] = self.__stores[key].get(val, 0) + 1

cds_ratio = 100 * __cds_length / __cdna_length
self.__stores["cds_ratio"].set(cds_ratio,
self.__stores["cds_ratio"].get(cds_ratio, 0) + 1)
self.__stores["cds_ratio"][cds_ratio] = self.__stores["cds_ratio"].get(cds_ratio, 0) + 1

if self.only_coding is False:
if gene.transcripts[tid].selected_cds_length > 0:
cdl = gene.transcripts[tid].cdna_length

self.__stores["cdna_lengths_coding"].set(
cdl,
self.__stores["cdna_lengths_coding"].get(cdl, 0) + 1)
self.__stores["cdna_lengths_coding"][cdl] = self.__stores["cdna_lengths_coding"].get(cdl, 0) + 1
for el in gene.transcripts[tid].exon_lengths:
self.__stores["exons_coding"].set(el, self.__stores["exons_coding"].get(el, 0) + 1)
self.__stores["exons_coding"][el] = self.__stores["exons_coding"].get(el, 0) + 1
num_exons = len(gene.transcripts[tid].exon_lengths)

self.__stores["exon_num_coding"].set(
num_exons,
self.__stores["exon_num_coding"].get(num_exons, 0) + 1)
self.__stores["exon_num_coding"][num_exons] = self.__stores["exon_num_coding"].get(num_exons,
0) + 1
cds_num_exons = len(gene.transcripts[tid].cds_exon_lengths)
self.__stores["cds_exon_num_coding"].set(
cds_num_exons,
self.__stores["cds_exon_num_coding"].get(cds_num_exons, 0) + 1)
self.__stores["cds_exon_num_coding"][cds_num_exons] = self.__stores[
"cds_exon_num_coding"].get(cds_num_exons,
0) + 1
for il in gene.transcripts[tid].intron_lengths:
self.__stores["introns_coding"].set(
il, self.__stores["introns_coding"].get(il, 0) + 1)
self.__stores["introns_coding"][il] = self.__stores["introns_coding"].get(il, 0) + 1
return

def __finalize_arrays(self):
Expand Down
2 changes: 1 addition & 1 deletion Mikado/tests/annotation.gff3
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Chr5 TAIR10 gene 1223 5061 . - . ID=AT5G01010;Note=protein_coding_gene;Name=AT5G01010
Chr5 TAIR10 gene 1251 5043 . - . ID=AT5G01010;Note=protein_coding_gene;Name=AT5G01010
Chr5 TAIR10 mRNA 1251 5043 . - . ID=AT5G01010.1;Parent=AT5G01010;Name=AT5G01010.1;Index=1
Chr5 TAIR10 five_prime_UTR 4925 5043 . - . Parent=AT5G01010.1
Chr5 TAIR10 CDS 4765 4924 . - 0 Parent=AT5G01010.1;
Expand Down
30 changes: 30 additions & 0 deletions Mikado/tests/annotation.gff3.stats
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
Stat Total Average Mode Min 1% 5% 10% 25% Median 75% 90% 95% 99% Max
Number of genes 2 NA NA NA NA NA NA NA NA NA NA NA NA NA
Number of genes (coding) 2 NA NA NA NA NA NA NA NA NA NA NA NA NA
Number of monoexonic genes 0 NA NA NA NA NA NA NA NA NA NA NA NA NA
Transcripts per gene 2 1.00 1 1 1 1 1 1 1 1 1 1 1 1
Coding transcripts per gene 2 1.00 1 1 1 1 1 1 1 1 1 1 1 1
CDNA lengths 3073 1,536.50 1500;1573 1,500 1,501 1,504 1,507 1,518 1,536 1,555 1,566 1,569 1,572 1,573
CDNA lengths (mRNAs) 3073 1,536.50 1500;1573 1,500 1,501 1,504 1,507 1,518 1,536 1,555 1,566 1,569 1,572 1,573
CDS lengths 2817 1,408.50 1317;1500 1,317 1,319 1,326 1,335 1,363 1,408 1,454 1,482 1,491 1,498 1,500
CDS lengths (mRNAs) NA 1,408.50 1317;1500 1,317 1,319 1,326 1,335 1,363 1,408 1,454 1,482 1,491 1,498 1,500
CDS/cDNA ratio NA 91.86 83.72536554354735;100.0 84 84 85 85 88 92 96 98 99 100 100
Monoexonic transcripts 0 NA NA NA NA NA NA NA NA NA NA NA NA NA
MonoCDS transcripts 0 NA NA NA NA NA NA NA NA NA NA NA NA NA
Exons per transcript 23 11.50 8;15 8 8 8 9 10 12 13 14 15 15 15
Exons per transcript (mRNAs) 22 11.50 8;15 8 8 8 9 10 12 13 14 15 15 15
Exon lengths NA 133.61 75 36 37 42 49 75 128 174 229 274 307 315
Exon lengths (mRNAs) NA 133.61 75 36 37 42 49 75 128 174 229 274 307 315
Intron lengths NA 134.67 87 72 73 75 76 85 96 124 238 368 452 473
Intron lengths (mRNAs) NA 134.67 87 72 73 75 76 85 96 124 238 368 452 473
CDS exons per transcript 22 11.50 8;15 8 8 8 9 10 12 13 14 15 15 15
CDS exons per transcript (mRNAs) 22 11.50 8;15 8 8 8 9 10 12 13 14 15 15 15
CDS exon lengths 2817 122.48 75 36 37 42 49 74 117 156 216 230 297 315
CDS Intron lengths 2807 133.67 86 71 72 74 75 84 95 123 237 367 451 472
5'UTR exon number 2 0.50 0;1 0 0 0 0 0 0 1 1 1 1 1
3'UTR exon number 2 0.50 0;1 0 0 0 0 0 0 1 1 1 1 1
5'UTR length 119 59.50 0;119 0 1 6 12 30 60 89 107 113 118 119
3'UTR length 137 68.50 0;137 0 1 7 14 34 68 103 123 130 136 137
Stop distance from junction NA 0.00 0 0 0 0 0 0 0 0 0 0 0 0
Intergenic distances NA 17,697.00 17697 17,697 17,697 17,697 17,697 17,697 17,697 17,697 17,697 17,697 17,697 17,697
Intergenic distances (coding) NA 17,697.00 17697 17,697 17,697 17,697 17,697 17,697 17,697 17,697 17,697 17,697 17,697 17,697
40 changes: 34 additions & 6 deletions Mikado/tests/test_system_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -1499,33 +1499,36 @@ def tearDownClass(cls):
os.remove(cls.__genomefile__.name)
os.remove(cls.fai.faidx.indexname)

@unittest.skip
#@unittest.skip
def test_subprocess_multi(self):

xml = pkg_resources.resource_filename("Mikado.tests", "chunk-001-proteins.xml.gz")
transcripts = pkg_resources.resource_filename("Mikado.tests", "mikado_prepared.fasta")
junctions = pkg_resources.resource_filename("Mikado.tests", "junctions.bed")
orfs = pkg_resources.resource_filename("Mikado.tests", "transcripts.fasta.prodigal.gff3")
uniprot = pkg_resources.resource_filename("Mikado.tests", "uniprot_sprot_plants.fasta.gz")
mobjects = 300 # Let's test properly the serialisation for BLAST
procs = 3

dir = tempfile.TemporaryDirectory()
json_file = os.path.join(dir.name, "mikado.yaml")
db = os.path.join(dir.name, "mikado.db")
log = os.path.join(dir.name, "serialise.log")
uni_out = os.path.join(dir.name, "uniprot_sprot_plants.fasta")
with gzip.open(uniprot, "rb") as uni, open(uni_out, "wb") as uni_out_handle:
uni_out_handle.write(uni.read())

with open(json_file, "wt") as json_handle:
sub_configure.print_config(yaml.dump(self.json_conf, default_flow_style=False),
json_handle)
# Set up the command arguments

sys.argv = [str(_) for _ in ["mikado", "serialise", "--json-conf", json_file,
"--transcripts", transcripts,
"--transcripts", transcripts, "--blast_targets", uni_out,
"--orfs", orfs, "--junctions", junctions, "--xml", xml,
"-p", procs, "-mo", mobjects, db, "--log", log]]

with self.assertRaises(SystemExit):
pkg_resources.load_entry_point("Mikado", "console_scripts", "mikado")()
pkg_resources.load_entry_point("Mikado", "console_scripts", "mikado")()

self.assertTrue(os.path.exists(db))
conn = sqlite3.connect(db)
Expand All @@ -1536,12 +1539,37 @@ def test_subprocess_multi(self):
self.assertEqual(cursor.execute("select count(distinct(query_id)) from hit").fetchall()[0][0], 71)
self.assertEqual(cursor.execute("select count(distinct(target_id)) from hsp").fetchall()[0][0], 32)
self.assertEqual(cursor.execute("select count(distinct(target_id)) from hit").fetchall()[0][0], 32)
self.assertEqual(cursor.execute("select count(*) from junctions").fetchall()[0][0], 371)
self.assertEqual(cursor.execute("select count(distinct(chrom_id)) from junctions").fetchall()[0][0], 1)
self.assertEqual(cursor.execute("select count(*) from junctions").fetchall()[0][0], 372)
self.assertEqual(cursor.execute("select count(distinct(chrom_id)) from junctions").fetchall()[0][0], 2)
self.assertEqual(cursor.execute("select count(*) from orf").fetchall()[0][0], 169)
self.assertEqual(cursor.execute("select count(distinct(query_id)) from orf").fetchall()[0][0], 81)
dir.cleanup()


class StatsTest(unittest.TestCase):
    """System test for the ``mikado util stats`` subcommand."""

    def test_annotation_stats(self):
        """Run ``mikado util stats`` on the bundled annotation file and
        compare its output, line by line, with the stored reference file.

        Lines are stripped of surrounding whitespace before the comparison.
        """

        annotation_file = pkg_resources.resource_filename("Mikado.tests", "annotation.gff3")
        annotation_check = pkg_resources.resource_filename("Mikado.tests", "annotation.gff3.stats")

        # Remember the real command line so that it can be restored even if
        # the entry point fails; otherwise later tests would inherit ours.
        original_argv = sys.argv

        # The context manager guarantees removal of the temporary folder,
        # including when an assertion below fails.
        with tempfile.TemporaryDirectory() as folder:
            out = os.path.join(folder, "annotation.gff3.stats")
            sys.argv = [str(_) for _ in ["mikado", "util", "stats", annotation_file, out]]
            try:
                # The entry point is called directly, without expecting a
                # SystemExit to be raised.
                pkg_resources.load_entry_point("Mikado", "console_scripts", "mikado")()
            finally:
                sys.argv = original_argv

            self.assertTrue(os.path.exists(out))

            # Read the result before the temporary folder is deleted.
            with open(out) as out_handle:
                out_lines = [_.strip() for _ in out_handle]

        with open(annotation_check) as check:
            check_lines = [_.strip() for _ in check]

        self.assertEqual(check_lines, out_lines)


if __name__ == "__main__":
unittest.main()

0 comments on commit 0c2cc18

Please sign in to comment.