diff --git a/.travis.yml b/.travis.yml index 9d1507acd..8c96c7992 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,11 +18,8 @@ install: - sed -i "s/defaults::python.*/defaults::python=$TRAVIS_PYTHON_VERSION/" environment.yml - conda env create -n env_name --file environment.yml - source activate env_name - - conda install --yes setuptools cython atlas numpy scipy scikit-learn biopython - - conda install --yes -c bioconda diamond prodigal samtools - pip install pytest-cov codecov; - if [[ "$(python -c "import sys; print(sys.version_info.minor)")" == "7" ]]; then wget https://github.com/pytries/datrie/archive/0.7.1.tar.gz; tar xf 0.7.1.tar.gz; cd datrie-0.7.1; ./update_c.sh; python3.7 setup.py build; python3.7 setup.py install; cd ../; fi; - - pip install -r requirements.txt - python setup.py develop; script: - touch plants.yaml && python -c "import Mikado; print(Mikado.__version__)" && rm plants.yaml; # This is to verify we fixed bug #124 diff --git a/CHANGELOG.md b/CHANGELOG.md index d9b4266c1..06d7d716e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ Users are ***very strongly recommended*** to update Mikado as soon as possible. **IMPORTANT**: this release has completely overhauled the scoring files. We now provide only two ("plant.yaml" and "mammalian.yaml"). "Plant.yaml" should function also for insect or fungal species, but we have not tested it extensively. Old scoring files can be found under "HISTORIC". Two of the major highlits of this release are: - - the completion of the "padding" functionality. Briefly, if instructed to do so, now Mikado will be able to uniform the ends of transcripts within a single locus (similar to what was done for the last _Arabidopsis thaliana_ annotation release). The behaviour is controlled by the "pad" boolean switch, and by the "ts_max_splices" and "ts_distance" parameters under "pick". Please note that now "ts_distance" refers to the **transcriptomic** distance, ie, long introns are not considered for this purpose. + - the completion of the "padding" functionality. Briefly, if instructed to do so, now Mikado will be able to uniform the ends of transcripts within a single locus (similar to what was done for the last _Arabidopsis thaliana_ annotation release). The behaviour is controlled by the "pad" boolean switch, and by the "ts_max_splices" and "ts_distance" parameters under "pick". Please note that now "ts_distance" refers to the **transcriptomic** distance, ie, long introns are not considered for this purpose. - general improvements in speed and multiprocessing, as well as flexibility, for the Mikado compare utility. With this release, we are also officially dropping support for Python 3.4. Python 3.5 will not be automatically tested for, as many Conda dependencies are not up-to-date, complicating the TRAVIS setup. diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json index 966c70caf..f21ccb4ce 100644 --- a/Mikado/configuration/configuration_blueprint.json +++ b/Mikado/configuration/configuration_blueprint.json @@ -239,6 +239,9 @@ "minimum_length": { "type": "integer", "default": 200, "minimum": 1 }, + "max_intron_length": { + "type": "integer", "default": 1000000, "minimum": 20 + }, "procs": {"type": "integer", "default": 1}, "files": { "Comment": ["Options related to the input and output files.", diff --git a/Mikado/loci/locus.py b/Mikado/loci/locus.py index f0cdb26d4..d550d5073 100644 --- a/Mikado/loci/locus.py +++ b/Mikado/loci/locus.py @@ -39,6 +39,11 @@ def __init__(self, transcript: Transcript, logger=None, json_conf=None, **kwargs self.counter = 0 transcript.attributes["primary"] = True + if transcript.is_coding: + transcript.feature = "mRNA" + else: + transcript.feature = "ncRNA" + self.counter = 0 # simple tag to avoid collisions Abstractlocus.__init__(self, logger=logger, json_conf=json_conf, **kwargs) # this must be defined straight away @@ -373,7 +378,12 @@ def add_transcript_to_locus(self, transcript: Transcript, check_in_locus=True, transcript.id, self.id) transcript.attributes["primary"] = False + if transcript.is_coding: + transcript.feature = "mRNA" + else: + transcript.feature = "ncRNA" Abstractlocus.add_transcript_to_locus(self, transcript) + self.locus_verified_introns.update(transcript.verified_introns) def __check_as_requirements(self, transcript: Transcript) -> bool: diff --git a/Mikado/parsers/bed12.py b/Mikado/parsers/bed12.py index eff67adfe..4198292df 100644 --- a/Mikado/parsers/bed12.py +++ b/Mikado/parsers/bed12.py @@ -8,7 +8,6 @@ import random import os -from Bio import SeqIO from Bio import Seq import Bio.SeqRecord from . import Parser @@ -19,6 +18,7 @@ import re from ..utilities.log_utils import create_null_logger from Bio.Data import CodonTable +import pysam standard = CodonTable.ambiguous_dna_by_id[1] standard.start_codons = ["ATG"] @@ -396,16 +396,25 @@ def __check_validity(self, transcriptomic, fasta_index, sequence): self.validity_checked = True if sequence is not None: self.fasta_length = len(sequence) - if isinstance(sequence, str): - sequence = Seq.Seq(sequence) + if hasattr(sequence, "seq"): + sequence = str(sequence.seq) + if not isinstance(sequence, str): + sequence = str(sequence) + # if isinstance(sequence, str): + # sequence = Seq.Seq(sequence) else: if self.id not in fasta_index: self.__in_index = False return self.fasta_length = len(fasta_index[self.id]) - sequence = fasta_index[self.id].seq + sequence = fasta_index[self.id] + if hasattr(sequence, "seq"): + sequence = str(sequence.seq) + if not isinstance(sequence, str): + sequence = str(sequence) + assert isinstance(sequence, str) # Just double check that the sequence length is the same as what the BED would suggest if self.invalid is True: self.coding = False @@ -415,8 +424,9 @@ def __check_validity(self, transcriptomic, fasta_index, sequence): orf_sequence = sequence[ (self.thick_start - 1 if not self.phase else self.start + self.phase - 1):self.thick_end] else: - orf_sequence = sequence[(self.thick_start - 1):( - self.thick_end if not self.phase else self.end - (3 - self.phase) % 3)].reverse_complement() + orf_sequence = Seq.reverse_complement( + sequence[(self.thick_start - 1):( + self.thick_end if not self.phase else self.end - (3 - self.phase) % 3)]) self.start_codon = str(orf_sequence)[:3].upper() self.stop_codon = str(orf_sequence[-3:]).upper() @@ -430,8 +440,8 @@ def __check_validity(self, transcriptomic, fasta_index, sequence): self.has_start_codon = False if self.start_adjustment is True: - if self.strand == "-": - sequence = sequence.reverse_complement() + # if self.strand == "-": + # sequence = Seq.reverse_complement(sequence) self._adjust_start(sequence, orf_sequence) if self.stop_codon in self.table.stop_codons: @@ -446,7 +456,7 @@ def __check_validity(self, transcriptomic, fasta_index, sequence): last_pos = -3 - ((len(orf_sequence)) % 3) if self.__lenient is False: - translated_seq = orf_sequence[:last_pos].translate(table=self.table, gap='N') + translated_seq = Seq.translate(orf_sequence[:last_pos], table=self.table, gap='N') self.__internal_stop_codons = str(translated_seq).count("*") if self.invalid is True: @@ -457,22 +467,27 @@ def _adjust_start(self, sequence, orf_sequence): assert len(orf_sequence) == (self.thick_end - self.thick_start + 1) # Let's check UPstream first. # This means that we DO NOT have a starting Met and yet we are starting far upstream. - if (self.strand == "+" and self.thick_start > 3) or (self.strand == "-" and self.end - self.thick_end > 3): + if self.strand == "+" and self.thick_start > 3: for pos in range(self.thick_start, 3, -3): - if self.strand == "+": - self.thick_start -= 3 - else: - self.thick_end += 3 + self.thick_start -= 3 if sequence[pos - 3:pos] in self.table.start_codons: # We have found a valid methionine. break elif sequence[pos - 3:pos] in self.table.stop_codons: - if self.strand == "+": - self.thick_start += 3 - else: - self.thick_end -= 3 + self.thick_start += 3 break continue + + elif self.strand == "-" and self.end - self.thick_end > 3: + for pos in range(self.thick_end, self.end - 3, 3): + self.thick_end += 3 + if Seq.reverse_complement(sequence[pos - 3:pos]) in self.table.start_codons: + # We have found a valid methionine. + break + elif Seq.reverse_complement(sequence[pos - 3:pos]) in self.table.stop_codons: + self.thick_end -= 3 + break + print("Thick end:", self.thick_end) else: for pos in range(3, int(len(orf_sequence) * self.max_regression), @@ -488,6 +503,8 @@ def _adjust_start(self, sequence, orf_sequence): break else: continue + print("Thick end:", self.thick_end) + if self.has_start_codon is False: # The validity will be automatically checked if self.strand == "+": @@ -863,7 +880,7 @@ def expand(self, sequence, upstream, downstream, expand_orf=False, logger=create if len(coding_seq) % 3 != 0: # Only get a multiple of three coding_seq = coding_seq[:-((len(coding_seq)) % 3)] - prot_seq = coding_seq.translate(table=self.table, gap="N") + prot_seq = Seq.translate(coding_seq, table=self.table, gap="N") if "*" in prot_seq: self.thick_end = self.thick_start + self.phase - 1 + (1 + prot_seq.find("*")) * 3 self.stop_codon = coding_seq[prot_seq.find("*") * 3:(1 + prot_seq.find("*")) * 3].upper() @@ -1020,9 +1037,9 @@ def __init__(self, handle, elif fasta_index is not None: if isinstance(fasta_index, str): assert os.path.exists(fasta_index) - fasta_index = SeqIO.index(fasta_index, "fasta") + fasta_index = pysam.FastaFile(fasta_index) else: - assert "SeqIO" in repr(fasta_index) and "index" in repr(fasta_index) + assert isinstance(fasta_index, pysam.FastaFile) self.fasta_index = fasta_index self.__closed = False diff --git a/Mikado/preparation/annotation_parser.py b/Mikado/preparation/annotation_parser.py index b663f56e6..cb11a229d 100644 --- a/Mikado/preparation/annotation_parser.py +++ b/Mikado/preparation/annotation_parser.py @@ -26,12 +26,14 @@ def __init__(self, logging_queue, identifier, min_length=0, + max_intron=3*10**5, log_level="WARNING", strip_cds=False): super().__init__() self.submission_queue = submission_queue self.min_length = min_length + self.max_intron = max_intron self.__strip_cds = strip_cds self.logging_queue = logging_queue self.log_level = log_level @@ -86,6 +88,7 @@ def run(self): found_ids, self.logger, min_length=self.min_length, + max_intron=self.max_intron, strip_cds=self.__strip_cds, is_reference=is_reference, strand_specific=strand_specific) @@ -96,6 +99,7 @@ def run(self): found_ids, self.logger, min_length=self.min_length, + max_intron=self.max_intron, is_reference=is_reference, strip_cds=self.__strip_cds, strand_specific=strand_specific) @@ -107,6 +111,7 @@ def run(self): self.logger, is_reference=is_reference, min_length=self.min_length, + max_intron=self.max_intron, strip_cds=self.__strip_cds, strand_specific=strand_specific) else: @@ -164,7 +169,7 @@ def __raise_invalid(row_id, name, label): "(label: {0})".format(label) if label != '' else "")) -def load_into_storage(shelf_name, exon_lines, min_length, logger, strip_cds=True): +def load_into_storage(shelf_name, exon_lines, min_length, logger, strip_cds=True, max_intron=3*10**5): """Function to load the exon_lines dictionary into the temporary storage.""" @@ -246,9 +251,17 @@ def load_into_storage(shelf_name, exon_lines, min_length, logger, strip_cds=True else: raise KeyError(exon_lines[tid]["features"]) - tlength = sum(exon[1] + 1 - exon[0] for exon in segments) - start = min((_[0] for _ in segments)) - end = max((_[1] for _ in segments)) + segments = sorted(segments, key=itemgetter(0)) + tlength = 0 + start, end = segments[0][0], segments[-1][1] + biggest_intron = -1 + num_segments = len(segments) + for pos, segment in enumerate(segments): + if pos < num_segments - 1: + later = segments[pos + 1] + biggest_intron = max(biggest_intron, + later[0] - (segment[1] + 1)) + tlength += segment[1] + 1 - segment[0] # Discard transcript under a certain size if tlength < min_length: @@ -260,6 +273,17 @@ def load_into_storage(shelf_name, exon_lines, min_length, logger, strip_cds=True tid, tlength, min_length) continue + # Discard transcripts with introns over the limit + if biggest_intron > max(-1, max_intron): + if exon_lines[tid]["is_reference"] is True: + logger.info( + "%s retained even if its longest intron is over the limit (%d) as it is a reference transcript.", + tid, biggest_intron) + else: + logger.info("Discarding %s because its longest intron (%d) is over the maximum of %d", + tid, biggest_intron, max_intron) + continue + values = json.dumps(exon_lines[tid]) logger.debug("Inserting %s into shelf %s", tid, shelf_name) @@ -282,6 +306,7 @@ def load_from_gff(shelf_name, found_ids, logger, min_length=0, + max_intron=3*10**5, is_reference=False, strip_cds=False, strand_specific=False): @@ -297,6 +322,8 @@ def load_from_gff(shelf_name, :type logger: logging.Logger :param min_length: minimum length for a cDNA to be considered as valid :type min_length: int + :param max_intron: maximum intron length for a cDNA to be considered as valid + :type max_intron: int :param strip_cds: boolean flag. If true, all CDS lines will be ignored. :type strip_cds: bool :param strand_specific: whether the assembly is strand-specific or not. @@ -422,7 +449,7 @@ def load_from_gff(shelf_name, logger.info("Starting to load %s", shelf_name) load_into_storage(shelf_name, exon_lines, - logger=logger, min_length=min_length, strip_cds=strip_cds) + logger=logger, min_length=min_length, strip_cds=strip_cds, max_intron=max_intron) return new_ids @@ -433,6 +460,7 @@ def load_from_gtf(shelf_name, found_ids, logger, min_length=0, + max_intron=3*10**5, is_reference=False, strip_cds=False, strand_specific=False): @@ -448,6 +476,8 @@ def load_from_gtf(shelf_name, :type logger: logging.Logger :param min_length: minimum length for a cDNA to be considered as valid :type min_length: int + :param max_intron: maximum intron length for a cDNA to be considered as valid + :type max_intron: int :param strip_cds: boolean flag. If true, all CDS lines will be ignored. :type strip_cds: bool :param strand_specific: whether the assembly is strand-specific or not. @@ -537,7 +567,7 @@ def load_from_gtf(shelf_name, logger.info("Starting to load %s", shelf_name) load_into_storage(shelf_name, exon_lines, - logger=logger, min_length=min_length, strip_cds=strip_cds) + logger=logger, min_length=min_length, strip_cds=strip_cds, max_intron=max_intron) return new_ids @@ -548,6 +578,7 @@ def load_from_bed12(shelf_name, found_ids, logger, min_length=0, + max_intron=3*10**5, is_reference=False, strip_cds=False, strand_specific=False): @@ -563,6 +594,8 @@ def load_from_bed12(shelf_name, :type logger: logging.Logger :param min_length: minimum length for a cDNA to be considered as valid :type min_length: int + :param max_intron: maximum intron length for a cDNA to be considered as valid + :type max_intron: int :param strip_cds: boolean flag. If true, all CDS lines will be ignored. :type strip_cds: bool :param strand_specific: whether the assembly is strand-specific or not. @@ -621,6 +654,6 @@ def load_from_bed12(shelf_name, new_ids.add(transcript.id) gff_handle.close() load_into_storage(shelf_name, exon_lines, - logger=logger, min_length=min_length, strip_cds=strip_cds) + logger=logger, min_length=min_length, strip_cds=strip_cds, max_intron=max_intron) return new_ids \ No newline at end of file diff --git a/Mikado/preparation/prepare.py b/Mikado/preparation/prepare.py index 22501d15b..290cc751d 100644 --- a/Mikado/preparation/prepare.py +++ b/Mikado/preparation/prepare.py @@ -240,7 +240,7 @@ def perform_check(keys, shelve_stacks, args, logger): return -def _load_exon_lines_single_thread(args, shelve_names, logger, min_length, strip_cds): +def _load_exon_lines_single_thread(args, shelve_names, logger, min_length, strip_cds, max_intron): logger.info("Starting to load lines from %d files (single-threaded)", len(args.json_conf["prepare"]["files"]["gff"])) @@ -264,6 +264,7 @@ def _load_exon_lines_single_thread(args, shelve_names, logger, min_length, strip found_ids, logger, min_length=min_length, + max_intron=max_intron, strip_cds=strip_cds and not is_reference, is_reference=is_reference, strand_specific=strand_specific or is_reference) @@ -274,6 +275,7 @@ def _load_exon_lines_single_thread(args, shelve_names, logger, min_length, strip found_ids, logger, min_length=min_length, + max_intron=max_intron, strip_cds=strip_cds and not is_reference, is_reference=is_reference, strand_specific=strand_specific or is_reference) @@ -284,6 +286,7 @@ def _load_exon_lines_single_thread(args, shelve_names, logger, min_length, strip found_ids, logger, min_length=min_length, + max_intron=max_intron, strip_cds=strip_cds and not is_reference, is_reference=is_reference, strand_specific=strand_specific or is_reference) @@ -292,7 +295,7 @@ def _load_exon_lines_single_thread(args, shelve_names, logger, min_length, strip return -def _load_exon_lines_multi(args, shelve_names, logger, min_length, strip_cds, threads): +def _load_exon_lines_multi(args, shelve_names, logger, min_length, strip_cds, threads, max_intron=3*10**5): logger.info("Starting to load lines from %d files (using %d processes)", len(args.json_conf["prepare"]["files"]["gff"]), threads) submission_queue = multiprocessing.JoinableQueue(-1) @@ -306,6 +309,7 @@ def _load_exon_lines_multi(args, shelve_names, logger, min_length, strip_cds, th num + 1, log_level=args.level, min_length=min_length, + max_intron=max_intron, strip_cds=strip_cds) proc.start() working_processes.append(proc) @@ -345,7 +349,7 @@ def _load_exon_lines_multi(args, shelve_names, logger, min_length, strip_cds, th gc.collect() -def load_exon_lines(args, shelve_names, logger, min_length=0): +def load_exon_lines(args, shelve_names, logger, min_length=0, max_intron=3*10**5): """This function loads all exon lines from the GFF inputs into a defaultdict instance. @@ -353,8 +357,8 @@ def load_exon_lines(args, shelve_names, logger, min_length=0): :param shelve_names: list of names of the shelf DB files. :param logger: the logger instance. :type logger: logging.Logger - :param min_length: minimal length of the transcript. - If it is not met, the transcript will be discarded. + :param min_length: minimal length of the transcript. If it is not met, the transcript will be discarded. + :param max_intron: maximum length for an intron. If it is not met, the transcript will be discarded. :type min_length: int f :return: exon_lines @@ -366,9 +370,9 @@ def load_exon_lines(args, shelve_names, logger, min_length=0): strip_cds = args.json_conf["prepare"]["strip_cds"] if args.json_conf["prepare"]["single"] is True or threads == 1: - _load_exon_lines_single_thread(args, shelve_names, logger, min_length, strip_cds) + _load_exon_lines_single_thread(args, shelve_names, logger, min_length, strip_cds, max_intron) else: - _load_exon_lines_multi(args, shelve_names, logger, min_length, strip_cds, threads) + _load_exon_lines_multi(args, shelve_names, logger, min_length, strip_cds, threads, max_intron) logger.info("Finished loading lines from %d files", len(args.json_conf["prepare"]["files"]["gff"])) @@ -457,7 +461,8 @@ def prepare(args, logger): load_exon_lines(args, shelve_names, logger, - min_length=args.json_conf["prepare"]["minimum_length"]) + min_length=args.json_conf["prepare"]["minimum_length"], + max_intron=args.json_conf["prepare"]["max_intron_length"],) logger.info("Finished loading exon lines") diff --git a/Mikado/serializers/orf.py b/Mikado/serializers/orf.py index 704c1b72e..459f2aed3 100644 --- a/Mikado/serializers/orf.py +++ b/Mikado/serializers/orf.py @@ -179,7 +179,6 @@ def __init__(self, if isinstance(fasta_index, str): assert os.path.exists(fasta_index) self.fasta_index = pysam.FastaFile(fasta_index) - # self.fasta_index = SeqIO.index(fasta_index, "fasta") elif fasta_index is None: exc = ValueError("A fasta index is needed for the serialization!") self.logger.exception(exc) diff --git a/Mikado/subprograms/configure.py b/Mikado/subprograms/configure.py index 34de41dd4..b7fe25bd7 100644 --- a/Mikado/subprograms/configure.py +++ b/Mikado/subprograms/configure.py @@ -166,6 +166,12 @@ def create_config(args): if args.seed is not None: config["seed"] = args.seed + if args.min_cdna_length not in (None, False): + config["prepare"]["minimum_length"] = args.min_cdna_length + + if args.max_intron_length not in (None, False): + config["prepare"]["max_intron_length"] = args.max_intron_length + if args.reference is not None: config["reference"]["genome"] = args.reference @@ -341,6 +347,11 @@ def configure_parser(): parser.add_argument("--full", action="store_true", default=False) parser.add_argument("--seed", type=int, default=None, help="Random seed number.") + preparer = parser.add_argument_group("Options related to the prepare stage.") + preparer.add_argument("--mininimum-cdna-length", default=None, type=int, dest="min_cdna_length", + help="Minimum cDNA length for transcripts.") + preparer.add_argument("--max-intron-size", default=None, type=int, dest="max_intron_length", + help="Maximum intron length for transcripts.") scoring = parser.add_argument_group("Options related to the scoring system") scoring.add_argument("--scoring", type=str, default=None, help="Scoring file to use. Mikado provides the following:\n{}".format( @@ -355,9 +366,9 @@ def configure_parser(): help="""Range into which intron lengths should fall, as a couple of integers. Transcripts with intron lengths outside of this range will be penalised. Default: (60, 900)""") - picking.add_argument("--pad", default=False, - action="store_true", - help="Whether to pad transcripts in loci.") + picking.add_argument("--no-pad", default=True, dest="pad", + action="store_false", + help="Whether to disable padding transcripts.") parser.add_argument("--strand-specific", default=False, action="store_true", help="""Boolean flag indicating whether all the assemblies are strand-specific.""") diff --git a/Mikado/subprograms/pick.py b/Mikado/subprograms/pick.py index 560f19d90..a9a964670 100644 --- a/Mikado/subprograms/pick.py +++ b/Mikado/subprograms/pick.py @@ -53,6 +53,10 @@ def check_run_options(args, logger=create_null_logger()): args.json_conf["pick"]["run_options"]["single_thread"] = args.single + if args.seed is not None: + args.json_conf["seed"] = args.seed + random.seed(args.seed, version=2) + if args.no_cds is not False: args.json_conf["pick"]["run_options"]["exclude_cds"] = True if args.no_purge is True: diff --git a/Mikado/subprograms/prepare.py b/Mikado/subprograms/prepare.py index a5a982cfb..610613646 100644 --- a/Mikado/subprograms/prepare.py +++ b/Mikado/subprograms/prepare.py @@ -154,7 +154,7 @@ def setup(args): args.labels = [""] * len(args.json_conf["prepare"]["files"]["gff"]) args.json_conf["prepare"]["files"]["labels"] = args.labels - for option in ["minimum_length", "procs", "single"]: + for option in ["minimum_length", "procs", "single", "max_intron_length"]: if getattr(args, option) in (None, False): continue else: @@ -268,6 +268,8 @@ def positive(string): splices will be output as well.""") parser.add_argument("-m", "--minimum_length", default=None, type=positive, help="Minimum length for transcripts. Default: 200 bps.") + parser.add_argument("-MI", "--max-intron-length", default=None, type=positive, dest="max_intron_length", + help="Maximum intron length for transcripts. Default: 1,000,000 bps.") parser.add_argument("-p", "--procs", help="Number of processors to use (default %(default)s)", type=to_cpu_count, default=None) diff --git a/Mikado/tests/test_splitting.py b/Mikado/tests/test_splitting.py index 16245eb4e..8a18467d6 100644 --- a/Mikado/tests/test_splitting.py +++ b/Mikado/tests/test_splitting.py @@ -4,11 +4,12 @@ import operator import unittest from sys import version_info - +import pysam # import Mikado from .. import loci, parsers, utilities, configuration from ..transcripts.transcript_methods import splitting - +import tempfile +from ..parsers import bed12 if version_info.minor < 5: from sortedcontainers import SortedDict else: @@ -474,5 +475,55 @@ def testNegative(self): self.assertEqual(self.transcript.selected_cds_start, 4000) self.assertEqual(self.transcript.strand, "-") + def test_negative_orf_gtg(self): + + fasta = """>sex_morph_FW.stringtie_sex_morph_FW_str.232.2 +CACAGTCTCGTGCGGCTATTTTCGTCCGCCGCCTGTCCCTCTAAGAAGAGTTTAAGCTCC +TGGGAGCCGGCGGTAGCCCTAGTAACGTATCGTGATCCGCCGGCGACGGCCGCGAACGCG +GCGCGCTTCACCACGGAGCCCCAGGCACGACAGACGCACGCCGACCAGGGGATGAACCCC +GGCCGACGCACGCCCGCCGCACGCAGGGACGCGGATGGCGCGGCCGCGCCCGACGACCGC +CGTGGACGACGGGCGAACGCGTTCGGGGATACCGGGCCGAGCCGACGGGAACGCGAACAC +GGACGGCCGAAACCGCCCGCGCCGCGCCCACCGCCGACCCGGGTTTACCCGCCTAGTTAG +CAGGACAGAGTCTCGTTCGTTATCGGAATTAACCAGACAGATCGCTCCACCAACTAAGAA +CGGCCATGCACCACCACCCACCGAATCAAGAAAGAGCTCTCAATCTGTCAATCTTTCCGG +TGTCCGGGCCTGGTGAGGTTTCCCGTGTTGAGTCAAATTAAGCCGCAGGCTCCACTCCTG +GTGGTGCCCTTCCGTCAATTCCTTTAAGTTTCAACTTTGCAATCATACTTCCCCCGGAAC +CGAAAAGCTTCGGTTTCCCGGAAGCTGCCCGCCGGGTCGTTAATGAAACGCCGGCGGATC +GCTAGCTGGCATCGTTTACAGTTAGAACTAGGGCGGTATCTGATCGCCTTCGAACCTCTA +ACTTTCGTTCTTGATCATACGAGAACGTACTTGGCAAATGCTTTCGCGTCAGTTCGTCTC +GAGACGATCCAAGAATTTCACCTCTAACGTCTCGGTACGAATGCCCCCGCCCGTCTCTGT +TGATCATTACCCCGGAGGGCGATTTCGCGCGCCCGCGAAGGGCGGAGATGCGCGGGACCA +AGGTCTTGTTCCATTATTCCATGCGACCAGTATTCAGGGCCTTTTGACGAGACGGCCGTG +AAGCCGCCCCGCCAGATTTGAGCCTGCTTTGAGCACTCTAATTTGTTCAAAGTAAACGTG +TCGGCCCGCCGACGGCACTCGGTGAAGAGCACCGCGCAGCAAGATTGGAGTAGGCGGCCG +CCGTCGTCGAACCCCGACGGCCGCGCGACGCGTGGCCGCGCGGCGCGCCGGAAGCACGAG +ACACGTGTCCGCC""" + + prodigal = "\t".join(["sex_morph_FW.stringtie_sex_morph_FW_str.232.2", + "Prodigal_v2.6.3", + "CDS", "934","1137", "5.7", "-", "0", + "ID=12199_3;partial=00;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.657;conf=78.73;score=5.69;cscore=7.02;sscore=-1.33;rscore=0.33;uscore=-0.91;tscore=-0.75;" + ]) + temp_fa = tempfile.NamedTemporaryFile(mode="wt", suffix=".fa") + temp_gff = tempfile.NamedTemporaryFile(mode="wt", suffix=".gff3") + print("##gff-version\t3", file=temp_gff) + print(prodigal, file=temp_gff) + print(fasta, file=temp_fa) + temp_gff.flush() + temp_fa.flush() + handle = open(temp_gff.name, mode="rt") + fasta_index = pysam.FastaFile(temp_fa.name) + bed12_parser = bed12.Bed12Parser(handle, + fasta_index=fasta_index, + is_gff=True, + transcriptomic=True, + max_regression=1) + record = next(bed12_parser) + self.assertIsInstance(record, bed12.BED12) + self.assertFalse(record.has_start_codon) + self.assertFalse(record.invalid) + self.assertEqual(record.phase, 1, record) + + if __name__ == "__main__": unittest.main() diff --git a/Mikado/tests/test_system_calls.py b/Mikado/tests/test_system_calls.py index 15cb66f3b..93c81c0c8 100644 --- a/Mikado/tests/test_system_calls.py +++ b/Mikado/tests/test_system_calls.py @@ -17,6 +17,8 @@ from ..configuration import configurator, daijin_configurator from ..picking import picker from ..preparation import prepare +from ..parsers import to_gff +from ..exceptions import InvalidJson from ..scales.compare import compare, load_index from ..subprograms.util.stats import Calculator from ..subprograms.prepare import prepare_launcher @@ -108,15 +110,6 @@ def setUpClass(cls): cls.maxDiff = None - # @classmethod - # def tearDownClass(cls): - # """""" - # - # cls.__genomefile__.close() - # os.remove(cls.__genomefile__.name) - # if os.path.exists("{}.fai".format(cls.__genomefile__.name)): - # os.remove("{}.fai".format(cls.__genomefile__.name)) - def setUp(self): self.conf = configurator.to_json(None) @@ -129,6 +122,36 @@ def setUp(self): def tearDown(self): logging.shutdown() + @mark.slow + def test_varying_max_intron(self): + + self.conf["prepare"]["files"]["labels"].append("tr") + dir = tempfile.TemporaryDirectory() + self.conf["prepare"]["files"]["output_dir"] = dir.name + args = Namespace() + args.json_conf = self.conf + test_file = "trinity.gtf" + self.conf["prepare"]["files"]["gff"] = [pkg_resources.resource_filename("Mikado.tests", + test_file)] + + for max_intron in (20, 200, 1000, 5000): + with self.subTest(max_intron=max_intron): + self.conf["prepare"]["max_intron_length"] = max_intron + prepare.prepare(args, self.logger) + gtf = os.path.join(self.conf["prepare"]["files"]["output_dir"], "mikado_prepared.gtf") + self.assertGreater(os.stat(gtf).st_size, 0, test_file) + transcripts = dict() + for row in to_gff(gtf): + if row.is_transcript: + transcripts[row.transcript] = Transcript(row) + else: + transcripts[row.transcript].add_exon(row) + self.assertGreater(len(transcripts), 0) + [_.finalize() for _ in transcripts.values()] + self.assertLessEqual(max([_.max_intron_length for _ in transcripts.values()]), + max_intron) + os.remove(gtf) + def test_prepare_trinity_gff(self): self.conf["prepare"]["files"]["labels"].append("tr") diff --git a/environment.yml b/environment.yml index 112b8cfdb..8658e4f4b 100644 --- a/environment.yml +++ b/environment.yml @@ -21,7 +21,7 @@ dependencies: - conda-forge::biopython>=1.73 - conda-forge::intervaltree>=3.0.2 - conda-forge::pytest>=4.4.0 - - bioconda::pyfaidx>=0.5.3 + - bioconda::pyfaidx>=0.5.5.2 - conda-forge::python-magic>=0.4.15 - bioconda::snakemake-minimal>=5.4.4 - conda-forge::drmaa>=0.7.7