diff --git a/.gitignore b/.gitignore index 7e936a881..7600cbb7b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *.pyc *.pyd +*.simg .#* #* *.coverage* diff --git a/Mikado/parsers/GFF.py b/Mikado/parsers/GFF.py index 448ccd733..3d492f44b 100755 --- a/Mikado/parsers/GFF.py +++ b/Mikado/parsers/GFF.py @@ -9,6 +9,7 @@ from . import Parser from .gfannotation import GFAnnotation from sys import intern +import re # This class has exactly how many attributes I need it to have @@ -16,6 +17,9 @@ class GffLine(GFAnnotation): """Object which serializes a GFF line.""" + # The (?:;|$) means "match, but **do not capture**", either a semicolon or the end of the line. + _attribute_pattern = re.compile(r"([^;]*)=([^$=]*)(?:;|$)") + def __init__(self, line, my_line='', header=False): """ Constructor method. @@ -41,19 +45,25 @@ def _parse_attributes(self): self.attribute_order = [] - for item in iter(x for x in self._attr.rstrip().split(';') if x != ''): - itemized = item.strip().split('=') - try: - if itemized[0].lower() == "parent": - self.parent = itemized[1].split(",") - - elif itemized[0].upper() == "ID": - self.id = itemized[1] - else: - self.attributes[itemized[0]] = itemized[1] - self.attribute_order.append(itemized[0]) - except IndexError: - pass + infolist = re.findall(self._attribute_pattern, self._attr.rstrip().rstrip(";")) + + for item in infolist: + key, val = item + if key.lower() == "parent": + self.parent = val.split(",") + elif key.upper() == "ID": + self.id = val + else: + try: + val = int(val) + except ValueError: + try: + val = float(val) + except ValueError: + pass + finally: + self.attributes[key] = val + self.attribute_order.append(key) def _format_attributes(self): """ diff --git a/Mikado/parsers/GTF.py b/Mikado/parsers/GTF.py index a44059887..051d6038a 100755 --- a/Mikado/parsers/GTF.py +++ b/Mikado/parsers/GTF.py @@ -6,6 +6,7 @@ from . 
import Parser from .gfannotation import GFAnnotation +import re # This class has exactly how many attributes I need it to have @@ -35,6 +36,8 @@ class GtfLine(GFAnnotation): # _slots=['chrom','source','feature','start',\ # 'end','score','strand','phase','info'] + _attribute_pattern = re.compile(r"([^;\s]*) \"([^\"]*)\"(?:;|$)") + def __init__(self, line, my_line='', header=False): self.__frame = None @@ -49,40 +52,29 @@ def _parse_attributes(self): :return: """ - # info_list = [] - for info in iter(x for x in self._attr.rstrip().split(';') if x != ''): - info = info.strip().split(' ') - # info_list.append(info) - # info = info.lstrip().split(' ') + # for info in iter(x for x in self._attr.rstrip().split(';') if x != ''): + # info = info.strip().split(' ') + # # info_list.append(info) + # # info = info.lstrip().split(' ') + # try: + # self.attributes[info[0]] = info[1].replace('"', '') + # except IndexError as exc: + # # something wrong has happened, let us just skip + # import sys + # print("Wrong attributes ({}) in line:\n{}".format(info, "\t".join(self._fields)), file=sys.stderr) + # if info[0] == "exon_number": + # self.attributes['exon_number'] = int(self.attributes['exon_number']) + + infodict = dict(re.findall(self._attribute_pattern, self._attr.rstrip())) + for key, val in infodict.items(): try: - self.attributes[info[0]] = info[1].replace('"', '') - except IndexError as exc: - # something wrong has happened, let us just skip - import sys - print("Wrong attributes ({}) in line:\n{}".format(info, "\t".join(self._fields)), file=sys.stderr) - if info[0] == "exon_number": - self.attributes['exon_number'] = int(self.attributes['exon_number']) - # elif info[0] in ("nearest_ref", "tss_id"): - # setattr(self, info[0], info[1]) - - # try: - # - # except IndexError: - # raise IndexError(info_list, info) - - # if 'exon_number' in self.attributes: - # self.attributes['exon_number'] = int(self.attributes['exon_number']) - assert 'gene_id', 'transcript_id' in 
self.attributes - - # if 'nearest_ref' in self.attributes: - # self.nearest_ref = self.attributes['nearest_ref'] - # if 'tss_id' in self.attributes: - # self.tss_id = self.attributes['tss_id'] - - # for tag in iter(att for att in self.attributes if - # att not in ('gene_id', 'transcript_id', 'nearest_ref', - # 'tss_id', 'class_code')): - # self.__dict__[tag.lower()] = self.attributes[tag] + val = int(val) + except ValueError: + try: + val = float(val) + except ValueError: + val = val.replace('"', '') + self.attributes[key] = val def _format_attributes(self): diff --git a/Mikado/tests/parser_test.py b/Mikado/tests/parser_test.py index b5704993e..e262955f4 100644 --- a/Mikado/tests/parser_test.py +++ b/Mikado/tests/parser_test.py @@ -293,6 +293,28 @@ def test_length(self): gtf_line.header, gtf_line.start, gtf_line.end = True, None, gtf_line.end self.assertEqual(len(gff_line), 0) + def test_pesky_gtf_line(self): + + line = """LG01\tbam2gtf\ttranscript\t44857\t46213\t60\t+\t.\tgene_id "transcript/12468.gene"; transcript_id "transcript/12468"; NM "92"; ms "842"; AS "806"; nn "0"; tp "P"; cm "145"; s1 "625"; s2 "372"; de "0.07109999656677246"; SA "GmG20150304_scaffold_7394,26960,-,673S2322M750D143S,60,68;GmG20150304_scaffold_5658,14965,+,50S192M723D2896S,37,22;"; coverage "100.0"; cigar "8M1I86M9I37M1I213M4I553M266N3M1I41M1D56M4I93M2028H”;""" + gtf_line = parsers.GTF.GtfLine(line) + self.assertEqual(gtf_line.feature, "transcript") + self.assertIn("SA", gtf_line.attributes.keys(), gtf_line.attributes) + self.assertEqual( + gtf_line.attributes["SA"], + "GmG20150304_scaffold_7394,26960,-,673S2322M750D143S,60,68;" + + "GmG20150304_scaffold_5658,14965,+,50S192M723D2896S,37,22;", + gtf_line.attributes) + + def test_pesky_gff_line(self): + + line = 
"""LG01\tbam2gtf\ttranscript\t44857\t46213\t60\t+\t.\tID=transcript/12468;Parent="transcript/12468.gene";NM=92;ms=842;AS=806;nn=0;tp=P;cm=145;s1=625;s2=372;de=0.07109999656677246;SA=GmG20150304_scaffold_7394,26960,-,673S2322M750D143S,60,68;GmG20150304_scaffold_5658,14965,+,50S192M723D2896S,37,22;;coverage=100.0;cigar=8M1I86M9I37M1I213M4I553M266N3M1I41M1D56M4I93M2028H;""" + gff_line = parsers.GFF.GffLine(line) + self.assertEqual(gff_line.feature, "transcript") + self.assertEqual( + gff_line.attributes["SA"], + "GmG20150304_scaffold_7394,26960,-,673S2322M750D143S,60,68;" + + "GmG20150304_scaffold_5658,14965,+,50S192M723D2896S,37,22;") + if __name__ == '__main__': unittest.main() diff --git a/Mikado/tests/test_system_calls.py b/Mikado/tests/test_system_calls.py index 6f225ca42..bc55c55ae 100644 --- a/Mikado/tests/test_system_calls.py +++ b/Mikado/tests/test_system_calls.py @@ -1270,8 +1270,8 @@ def __get_purgeable_gff(self): Chr1 foo exon 100 800 . + . gene_id "foo1"; transcript_id "foo1.2" Chr1 foo exon 1900 2000 . + . gene_id "foo1"; transcript_id "foo1.2" Chr1 foo transcript 10000 20000 . + . gene_id "foo2"; transcript_id "foo2.1" - Chr1 foo exon 10000 13000 . + . gene_id "foo2; transcript_id "foo2.1" - Chr1 foo exon 19000 20000 . + . gene_id "foo"; transcript_id "foo2.1""" + Chr1 foo exon 10000 13000 . + . gene_id "foo2"; transcript_id "foo2.1" + Chr1 foo exon 19000 20000 . + . 
gene_id "foo"; transcript_id "foo2.1\"""" dir = tempfile.TemporaryDirectory() temp_gtf = tempfile.NamedTemporaryFile(mode="wt", suffix=".gtf", dir=dir.name, delete=True) @@ -1336,10 +1336,10 @@ def test_purging1(self): with to_gff(os.path.join(dir.name, self.json_conf["pick"]["files"]["loci_out"])) as gff: - lines = [line for line in gff if line.header is False] self.assertGreater(len(lines), 0) - self.assertTrue(any([_ for _ in lines if _.attributes.get("alias", "") == "foo2.1"])) + self.assertTrue(any([_ for _ in lines if _.attributes.get("alias", "") == "foo2.1"]), + "\n".join([str(_) for _ in lines])) if purging is True: self.assertFalse(any([_ for _ in lines if _.attributes.get("alias", "") in ("foo1.2", "foo1.1")])) else: @@ -1460,7 +1460,8 @@ def test_purging3(self): self.json_conf["pick"]["files"]["loci_out"])) as gff: lines = [line for line in gff if line.header is False] self.assertGreater(len(lines), 0) - self.assertTrue(any([_ for _ in lines if _.attributes.get("alias", "") == "foo2.1"])) + self.assertTrue(any([_ for _ in lines if _.attributes.get("alias", "") == "foo2.1"]), + "\n".join([str(_) for _ in lines])) if purging is True: self.assertFalse(any([_ for _ in lines if _.attributes.get("alias", "") == "foo1.2"])) else: diff --git a/Mikado/transcripts/transcript.py b/Mikado/transcripts/transcript.py index b6cc74140..d47587d3b 100644 --- a/Mikado/transcripts/transcript.py +++ b/Mikado/transcripts/transcript.py @@ -489,7 +489,6 @@ def __initialize_with_gf(self, transcript_row: (GffLine, GtfLine)): booleans = {"True": True, "False": False, "None": None} for key, val in transcript_row.attributes.items(): - if not isinstance(val, Hashable): pass elif val in booleans: @@ -497,7 +496,7 @@ def __initialize_with_gf(self, transcript_row: (GffLine, GtfLine)): else: try: val = int(val) - except ValueError: + except (ValueError, OverflowError): try: val = float(val) except ValueError: diff --git a/Singularity.centos.def b/Singularity.centos.def index 
802429930..f83c5e387 100644 --- a/Singularity.centos.def +++ b/Singularity.centos.def @@ -19,9 +19,11 @@ Include: yum wget mikado --help %environment + export MIKADO_COMMIT_HASH=$(cd /usr/local/src/mikado && git log | head -n 1 | cut -f 2 -d " ") export PATH="/usr/local/bin:$PATH:/usr/local/conda/bin/" # source /usr/local/conda/bin/activate + %post ### Install your packages ### @@ -52,22 +54,19 @@ Include: yum wget # Install python requirements git clone https://github.com/EI-CoreBioinformatics/mikado.git cd mikado + git log | head -n 1 | cut -f 2 -d " " > ~/MIKADO_COMMIT_HASH sed -i 's/;.*//' requirements.txt conda install --update-all -y -c conda-forge -c bioconda -c anaconda --file requirements.txt python setup.py bdist_wheel pip install dist/*whl + echo '#!/bin/bash' >> /usr/local/bin/show_commit_hash + echo 'cd /usr/local/src/mikado' >> /usr/local/bin/show_commit_hash + echo 'git log | head -n1 | cut -f 2 -d " "' >> /usr/local/bin/show_commit_hash + chmod 775 /usr/local/bin/show_commit_hash + # Various dependencies for Daijin - conda install -y -c bioconda -c anaconda -c conda-forge samtools==1.9 openssl=1.0 prodigal blast diamond==0.9.24 transdecoder==5.5.0 stringtie==1.3.4 cufflinks==2.2.1 hisat2==2.1.0 gmap==2018.07.04 portcullis trinity star==2.7.0b minimap2==2.15 - cd /opt/software - wget https://github.com/Kingsford-Group/scallop/releases/download/v0.10.3/scallop-0.10.3_linux_x86_64.tar.gz && tar xaf scallop-0.10.3_linux_x86_64.tar.gz - mv scallop-0.10.3_linux_x86_64/scallop /usr/local/conda/bin/ && rm -rf scallop-0.10.3_linux_x86_64 scallop-0.10.3_linux_x86_64.tar.gz - yum install -y unzip - wget https://github.com/mourisl/CLASS/archive/v2.1.7.zip && unzip v2.1.7.zip && rm v2.1.7.zip - cd /opt/software/CLASS-2.1.7/ && sh build.sh && mv -t /usr/local/conda/bin/ class junc clnb grader addXS - cd /opt/software/ - rm -rf /opt/software/CLASS-2.1.7/ - cd /mnt/ + conda install -y -c bioconda -c anaconda -c conda-forge samtools==1.9 openssl=1.0 prodigal blast 
diamond==0.9.24 transdecoder==5.5.0 %apprun snakemake snakemake "@" @@ -75,3 +74,14 @@ Include: yum wget %apprun mikado mikado "@" +%apprun daijin + daijin "@" + +%apprun prodigal + prodigal "@" + +%apprun samtools + samtools "@" + +%apprun diamond + diamond "@" diff --git a/Singularity.ubuntu.def b/Singularity.ubuntu.def index f477d4273..974c97b1f 100644 --- a/Singularity.ubuntu.def +++ b/Singularity.ubuntu.def @@ -14,10 +14,10 @@ Include: apt wget mikado --help %environment + export MIKADO_COMMIT_HASH=$(cd /usr/local/src/mikado && git log | head -n 1 | cut -f 2 -d " ") export PATH="/usr/local/bin:$PATH:/usr/local/conda/bin/" # . /usr/local/conda/bin/activate - # source /usr/local/conda/bin/activate - # conda activate python36 + %post @@ -30,7 +30,7 @@ Include: apt wget gcc --version make --version - # Clean up yum + # Clean up apt apt clean cd /usr/local/src @@ -38,35 +38,25 @@ Include: apt wget bash Miniconda3-latest-Linux-x86_64.sh -b -p /usr/local/conda export PATH="/usr/local/conda/bin:$PATH" . 
/usr/local/conda/bin/activate - # sudo bash -c "/usr/local/conda/bin/conda init bash" - conda update -n base -c defaults conda + conda update -y -n base -c defaults conda ln -s /usr/local/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh conda install -y -c conda-forge pip python==3.6.7 - - python3 --version - cd $(dirname $(which python3)) - cd /opt/software/ - # Install python requirements git clone https://github.com/EI-CoreBioinformatics/mikado.git cd mikado + git log | head -n 1 | cut -f 2 -d " " > MIKADO_COMMIT_HASH sed -i 's/;.*//' requirements.txt conda install --update-all -y -c conda-forge -c bioconda -c anaconda --file requirements.txt python setup.py bdist_wheel pip install dist/*whl - # mikado --help + + echo '#!/bin/bash' >> /usr/local/bin/show_commit_hash + echo 'cd /usr/local/src/mikado' >> /usr/local/bin/show_commit_hash + echo 'git log | head -n1 | cut -f 2 -d " "' >> /usr/local/bin/show_commit_hash + chmod 775 /usr/local/bin/show_commit_hash # Various dependencies for Daijin - conda install -y -c bioconda -c anaconda -c conda-forge samtools==1.9 openssl=1.0 prodigal blast diamond==0.9.24 transdecoder==5.5.0 stringtie==1.3.4 cufflinks==2.2.1 hisat2==2.1.0 gmap==2018.07.04 portcullis trinity star==2.7.0b minimap2==2.15 - cd /opt/software - wget https://github.com/Kingsford-Group/scallop/releases/download/v0.10.3/scallop-0.10.3_linux_x86_64.tar.gz && tar xaf scallop-0.10.3_linux_x86_64.tar.gz - mv scallop-0.10.3_linux_x86_64/scallop /usr/local/conda/bin/ && rm -rf scallop-0.10.3_linux_x86_64 scallop-0.10.3_linux_x86_64.tar.gz - apt install -y unzip - wget https://github.com/mourisl/CLASS/archive/v2.1.7.zip && unzip v2.1.7.zip && rm v2.1.7.zip - cd /opt/software/CLASS-2.1.7/ && sh build.sh && mv -t /usr/local/conda/bin/ class junc clnb grader addXS - cd /opt/software/ - rm -rf /opt/software/CLASS-2.1.7/ - cd /mnt/ + conda install -y -c bioconda -c anaconda -c conda-forge samtools==1.9 openssl=1.0 prodigal blast diamond==0.9.24 transdecoder==5.5.0 
%apprun snakemake snakemake "@" @@ -74,3 +64,20 @@ Include: apt wget %apprun mikado mikado "@" +%apprun daijin + daijin "@" + +%apprun snakemake + snakemake "@" + +%apprun mikado + mikado "@" + +%apprun prodigal + prodigal "@" + +%apprun samtools + samtools "@" + +%apprun diamond + diamond "@" diff --git a/conda-1.5/meta.yaml b/conda-1.5/meta.yaml new file mode 100644 index 000000000..e08b95221 --- /dev/null +++ b/conda-1.5/meta.yaml @@ -0,0 +1,110 @@ +{% set name = "mikado" %} +{% set version = "1.5" %} +{% set file_ext = "tar.gz" %} +{% set hash_type = "sha256" %} +{% set hash_value = "" %} + +package: + name: '{{ name|lower }}' + version: '{{ version }}' + +source: + fn: '{{ version }}.{{ file_ext }}' + url: https://github.com/lucventurini/mikado/archive/1.5.tar.gz + '{{ hash_type }}': '{{ hash_value }}' + +build: + skip: True # [py27 or osx or py33 or py34] + number: 0 + entry_points: + - mikado = Mikado:main + - daijin = Mikado.daijin:main + script: $PYTHON setup.py install --single-version-externally-managed --record=record.txt + +requirements: + build: + - python + - setuptools + - wheel >=0.28.0 + - pyyaml + - jsonschema + - cython >=0.28.2 + - numpy >=1.16 + - networkx >=1.10 + - sqlalchemy >=1 + - sqlalchemy-utils + - biopython >=1.70 + - intervaltree + - nose + - pyfaidx + - scikit-learn >=0.20 + - scipy >=0.15.0 + - frozendict + - libmagic + - python-magic + - drmaa + - snakemake + - docutils !=0.13.1 + - tabulate + - simplejson + - ujson + - typing + - pytest-cov + run: + - python + - wheel >=0.28.0 + - pyyaml + - jsonschema + - cython >=0.28.2 + - numpy + - networkx >=1.10 + - sqlalchemy >=1 + - sqlalchemy-utils + - biopython >=1.70 + - intervaltree + - pyfaidx + - scikit-learn >=0.20 + - scipy >=0.15.0 + - frozendict + - libmagic + - python-magic + - drmaa + - snakemake + - simplejson + - docutils !=0.13.1 + - tabulate + - ujson + - portcullis + - typing + - pandas + - pytest-cov + +test: + imports: + - Mikado + - Mikado.configuration + - 
Mikado.daijin + - Mikado.loci + - Mikado.parsers + - Mikado.picking + - Mikado.preparation + - Mikado.scales + - Mikado.serializers + - Mikado.serializers.blast_serializer + - Mikado.subprograms + - Mikado.subprograms.util + - Mikado.tests + - Mikado.transcripts + - Mikado.transcripts.transcript_methods + - Mikado.utilities + commands: + - mikado --help + - daijin --help + - python -c "import Mikado; Mikado.test()" + +about: + home: https://github.com/EI-CoreBioinformatics/mikado/ + license: GNU Lesser General Public License v3 or later (LGPLv3+) + license_family: LGPL + summary: A Python3 annotation program to select the best gene model in each locus +