diff --git a/Mikado/subprograms/util/stats.py b/Mikado/subprograms/util/stats.py index 98017c40a..7e109fc22 100644 --- a/Mikado/subprograms/util/stats.py +++ b/Mikado/subprograms/util/stats.py @@ -540,52 +540,33 @@ def __store_gene(self, gene): gene.transcripts[tid].selected_cds_length] = self.__stores["cds_lengths_coding"].get( gene.transcripts[tid].selected_cds_length, 0) + 1 - self.__stores["five_utr_lengths"].set(gene.transcripts[tid].five_utr_length, - self.__stores["five_utr_lengths"].get( - gene.transcripts[tid].five_utr_length, 0) + 1) - - self.__stores["three_utr_lengths"].set(gene.transcripts[tid].three_utr_length, - self.__stores["three_utr_lengths"].get( - gene.transcripts[tid].three_utr_length, 0) + 1) - - self.__stores["five_utr_nums"].set(gene.transcripts[tid].five_utr_num, - self.__stores["five_utr_nums"].get( - gene.transcripts[tid].five_utr_num, 0) + 1) - - self.__stores["three_utr_nums"].set(gene.transcripts[tid].three_utr_num, - self.__stores["three_utr_nums"].get( - gene.transcripts[tid].three_utr_num, 0) + 1) - self.__stores["end_distance_from_junction"].set( - gene.transcripts[tid].selected_end_distance_from_junction, - self.__stores["end_distance_from_junction"].get( - gene.transcripts[tid].selected_end_distance_from_junction, 0) + 1 - ) + for key, attribute in (("five_utr_lengths", "five_utr_length"), + ("three_utr_lengths", "three_utr_length"), + ("five_utr_nums", "five_utr_num"), + ("three_utr_nums", "three_utr_num"), + ("end_distance_from_junction", "end_distance_from_junction")): + val = getattr(gene.transcripts[tid], attribute) + self.__stores[key][val] = self.__stores[key].get(val, 0) + 1 cds_ratio = 100 * __cds_length / __cdna_length - self.__stores["cds_ratio"].set(cds_ratio, - self.__stores["cds_ratio"].get(cds_ratio, 0) + 1) + self.__stores["cds_ratio"][cds_ratio] = self.__stores["cds_ratio"].get(cds_ratio, 0) + 1 if self.only_coding is False: if gene.transcripts[tid].selected_cds_length > 0: cdl = gene.transcripts[tid].cdna_length - - self.__stores["cdna_lengths_coding"].set( - cdl, - self.__stores["cdna_lengths_coding"].get(cdl, 0) + 1) + self.__stores["cdna_lengths_coding"][cdl] = self.__stores["cdna_lengths_coding"].get(cdl, 0) + 1 for el in gene.transcripts[tid].exon_lengths: - self.__stores["exons_coding"].set(el, self.__stores["exons_coding"].get(el, 0) + 1) + self.__stores["exons_coding"][el] = self.__stores["exons_coding"].get(el, 0) + 1 num_exons = len(gene.transcripts[tid].exon_lengths) - self.__stores["exon_num_coding"].set( - num_exons, - self.__stores["exon_num_coding"].get(num_exons, 0) + 1) + self.__stores["exon_num_coding"][num_exons] = self.__stores["exon_num_coding"].get(num_exons, + 0) + 1 cds_num_exons = len(gene.transcripts[tid].cds_exon_lengths) - self.__stores["cds_exon_num_coding"].set( - cds_num_exons, - self.__stores["cds_exon_num_coding"].get(cds_num_exons, 0) + 1) + self.__stores["cds_exon_num_coding"][cds_num_exons] = self.__stores[ + "cds_exon_num_coding"].get(cds_num_exons, + 0) + 1 for il in gene.transcripts[tid].intron_lengths: - self.__stores["introns_coding"].set( - il, self.__stores["introns_coding"].get(il, 0) + 1) + self.__stores["introns_coding"][il] = self.__stores["introns_coding"].get(il, 0) + 1 return def __finalize_arrays(self): diff --git a/Mikado/tests/annotation.gff3 b/Mikado/tests/annotation.gff3 index 1248c34ac..b3a6743ef 100644 --- a/Mikado/tests/annotation.gff3 +++ b/Mikado/tests/annotation.gff3 @@ -1,4 +1,4 @@ -Chr5 TAIR10 gene 1223 5061 . - . ID=AT5G01010;Note=protein_coding_gene;Name=AT5G01010 +Chr5 TAIR10 gene 1251 5043 . - . ID=AT5G01010;Note=protein_coding_gene;Name=AT5G01010 Chr5 TAIR10 mRNA 1251 5043 . - . ID=AT5G01010.1;Parent=AT5G01010;Name=AT5G01010.1;Index=1 Chr5 TAIR10 five_prime_UTR 4925 5043 . - . Parent=AT5G01010.1 Chr5 TAIR10 CDS 4765 4924 . - 0 Parent=AT5G01010.1; diff --git a/Mikado/tests/annotation.gff3.stats b/Mikado/tests/annotation.gff3.stats new file mode 100644 index 000000000..424980298 --- /dev/null +++ b/Mikado/tests/annotation.gff3.stats @@ -0,0 +1,30 @@ +Stat Total Average Mode Min 1% 5% 10% 25% Median 75% 90% 95% 99% Max +Number of genes 2 NA NA NA NA NA NA NA NA NA NA NA NA NA +Number of genes (coding) 2 NA NA NA NA NA NA NA NA NA NA NA NA NA +Number of monoexonic genes 0 NA NA NA NA NA NA NA NA NA NA NA NA NA +Transcripts per gene 2 1.00 1 1 1 1 1 1 1 1 1 1 1 1 +Coding transcripts per gene 2 1.00 1 1 1 1 1 1 1 1 1 1 1 1 +CDNA lengths 3073 1,536.50 1500;1573 1,500 1,501 1,504 1,507 1,518 1,536 1,555 1,566 1,569 1,572 1,573 +CDNA lengths (mRNAs) 3073 1,536.50 1500;1573 1,500 1,501 1,504 1,507 1,518 1,536 1,555 1,566 1,569 1,572 1,573 +CDS lengths 2817 1,408.50 1317;1500 1,317 1,319 1,326 1,335 1,363 1,408 1,454 1,482 1,491 1,498 1,500 +CDS lengths (mRNAs) NA 1,408.50 1317;1500 1,317 1,319 1,326 1,335 1,363 1,408 1,454 1,482 1,491 1,498 1,500 +CDS/cDNA ratio NA 91.86 83.72536554354735;100.0 84 84 85 85 88 92 96 98 99 100 100 +Monoexonic transcripts 0 NA NA NA NA NA NA NA NA NA NA NA NA NA +MonoCDS transcripts 0 NA NA NA NA NA NA NA NA NA NA NA NA NA +Exons per transcript 23 11.50 8;15 8 8 8 9 10 12 13 14 15 15 15 +Exons per transcript (mRNAs) 22 11.50 8;15 8 8 8 9 10 12 13 14 15 15 15 +Exon lengths NA 133.61 75 36 37 42 49 75 128 174 229 274 307 315 +Exon lengths (mRNAs) NA 133.61 75 36 37 42 49 75 128 174 229 274 307 315 +Intron lengths NA 134.67 87 72 73 75 76 85 96 124 238 368 452 473 +Intron lengths (mRNAs) NA 134.67 87 72 73 75 76 85 96 124 238 368 452 473 +CDS exons per transcript 22 11.50 8;15 8 8 8 9 10 12 13 14 15 15 15 +CDS exons per transcript (mRNAs) 22 11.50 8;15 8 8 8 9 10 12 13 14 15 15 15 +CDS exon lengths 2817 122.48 75 36 37 42 49 74 117 156 216 230 297 315 +CDS Intron lengths 2807 133.67 86 71 72 74 75 84 95 123 237 367 451 472 +5'UTR exon number 2 0.50 0;1 0 0 0 0 0 0 1 1 1 1 1 +3'UTR exon number 2 0.50 0;1 0 0 0 0 0 0 1 1 1 1 1 +5'UTR length 119 59.50 0;119 0 1 6 12 30 60 89 107 113 118 119 +3'UTR length 137 68.50 0;137 0 1 7 14 34 68 103 123 130 136 137 +Stop distance from junction NA 0.00 0 0 0 0 0 0 0 0 0 0 0 0 +Intergenic distances NA 17,697.00 17697 17,697 17,697 17,697 17,697 17,697 17,697 17,697 17,697 17,697 17,697 17,697 +Intergenic distances (coding) NA 17,697.00 17697 17,697 17,697 17,697 17,697 17,697 17,697 17,697 17,697 17,697 17,697 17,697 diff --git a/Mikado/tests/test_system_calls.py b/Mikado/tests/test_system_calls.py index eba1b11d9..6f225ca42 100644 --- a/Mikado/tests/test_system_calls.py +++ b/Mikado/tests/test_system_calls.py @@ -1499,13 +1499,14 @@ def tearDownClass(cls): os.remove(cls.__genomefile__.name) os.remove(cls.fai.faidx.indexname) - @unittest.skip + #@unittest.skip def test_subprocess_multi(self): xml = pkg_resources.resource_filename("Mikado.tests", "chunk-001-proteins.xml.gz") transcripts = pkg_resources.resource_filename("Mikado.tests", "mikado_prepared.fasta") junctions = pkg_resources.resource_filename("Mikado.tests", "junctions.bed") orfs = pkg_resources.resource_filename("Mikado.tests", "transcripts.fasta.prodigal.gff3") + uniprot = pkg_resources.resource_filename("Mikado.tests", "uniprot_sprot_plants.fasta.gz") mobjects = 300 # Let's test properly the serialisation for BLAST procs = 3 @@ -1513,6 +1514,9 @@ def test_subprocess_multi(self): json_file = os.path.join(dir.name, "mikado.yaml") db = os.path.join(dir.name, "mikado.db") log = os.path.join(dir.name, "serialise.log") + uni_out = os.path.join(dir.name, "uniprot_sprot_plants.fasta") + with gzip.open(uniprot, "rb") as uni, open(uni_out, "wb") as uni_out_handle: + uni_out_handle.write(uni.read()) with open(json_file, "wt") as json_handle: sub_configure.print_config(yaml.dump(self.json_conf, default_flow_style=False), @@ -1520,12 +1524,11 @@ def test_subprocess_multi(self): # Set up the command arguments sys.argv = [str(_) for _ in ["mikado", "serialise", "--json-conf", json_file, - "--transcripts", transcripts, + "--transcripts", transcripts, "--blast_targets", uni_out, "--orfs", orfs, "--junctions", junctions, "--xml", xml, "-p", procs, "-mo", mobjects, db, "--log", log]] - with self.assertRaises(SystemExit): - pkg_resources.load_entry_point("Mikado", "console_scripts", "mikado")() + pkg_resources.load_entry_point("Mikado", "console_scripts", "mikado")() self.assertTrue(os.path.exists(db)) conn = sqlite3.connect(db) @@ -1536,12 +1539,37 @@ def test_subprocess_multi(self): self.assertEqual(cursor.execute("select count(distinct(query_id)) from hit").fetchall()[0][0], 71) self.assertEqual(cursor.execute("select count(distinct(target_id)) from hsp").fetchall()[0][0], 32) self.assertEqual(cursor.execute("select count(distinct(target_id)) from hit").fetchall()[0][0], 32) - self.assertEqual(cursor.execute("select count(*) from junctions").fetchall()[0][0], 371) - self.assertEqual(cursor.execute("select count(distinct(chrom_id)) from junctions").fetchall()[0][0], 1) + self.assertEqual(cursor.execute("select count(*) from junctions").fetchall()[0][0], 372) + self.assertEqual(cursor.execute("select count(distinct(chrom_id)) from junctions").fetchall()[0][0], 2) self.assertEqual(cursor.execute("select count(*) from orf").fetchall()[0][0], 169) self.assertEqual(cursor.execute("select count(distinct(query_id)) from orf").fetchall()[0][0], 81) dir.cleanup() +class StatsTest(unittest.TestCase): + + def test_annotation_stats(self): + + annotation_file = pkg_resources.resource_filename("Mikado.tests", "annotation.gff3") + annotation_check = pkg_resources.resource_filename("Mikado.tests", "annotation.gff3.stats") + + dir = tempfile.TemporaryDirectory() + out = os.path.join(dir.name, "annotation.gff3.stats") + sys.argv = [str(_) for _ in ["mikado", "util", "stats", annotation_file, out]] + # with self.assertRaises(SystemExit): + # pkg_resources.load_entry_point("Mikado", "console_scripts", "mikado")() + pkg_resources.load_entry_point("Mikado", "console_scripts", "mikado")() + + self.assertTrue(os.path.exists(out)) + + with open(annotation_check) as check: + check_lines = [_.strip() for _ in check] + + with open(out) as out_handle: + out_lines = [_.strip() for _ in out_handle] + + self.assertEqual(check_lines, out_lines) + + if __name__ == "__main__": unittest.main()