diff --git a/bin/pyroe b/bin/pyroe index 7333349..9c68cd0 100755 --- a/bin/pyroe +++ b/bin/pyroe @@ -100,11 +100,6 @@ if __name__ == "__main__": action="store_true", help="A flag indicates whether flank lengths will be considered when merging introns.", ) - parser_makeSplici.add_argument( - "--write-clean-gtf", - action="store_true", - help="A flag indicates whether a clean gtf will be written if encountered invalid records.", - ) # make-spliceu parser_makeSpliceu = subparsers.add_parser( @@ -159,11 +154,6 @@ if __name__ == "__main__": action="store_true", help="A flag indicates whether identical sequences will be deduplicated.", ) - parser_makeSpliceu.add_argument( - "--write-clean-gtf", - action="store_true", - help="A flag indicates whether a clean gtf will be written if encountered invalid records.", - ) # parse available datasets available_datasets = fetch_processed_quant() @@ -274,7 +264,6 @@ if __name__ == "__main__": no_bt=args.no_bt, bt_path=args.bt_path, no_flanking_merge=args.no_flanking_merge, - write_clean_gtf=args.write_clean_gtf, ) elif args.command == "make-spliceu": make_spliceu_txome( @@ -287,7 +276,6 @@ if __name__ == "__main__": dedup_seqs=args.dedup_seqs, no_bt=args.no_bt, bt_path=args.bt_path, - write_clean_gtf=args.write_clean_gtf, ) elif args.command == "fetch-quant": fetch_processed_quant( diff --git a/src/pyroe/make_txome.py b/src/pyroe/make_txome.py index 5310535..308b8b1 100644 --- a/src/pyroe/make_txome.py +++ b/src/pyroe/make_txome.py @@ -12,6 +12,16 @@ from packaging.version import parse as parse_version import logging +bed_required_fields = [ + "Chromosome", + "Start", + "End", + "Strand", + "Name", + "Gene", + "splice_status", +] + def append_extra(extra_infile, out_fa, out_t2g3col, id2name_path, col_status): """ @@ -608,7 +618,8 @@ def make_splici_txome( " https://pyranges.readthedocs.io/en/latest/autoapi/pyranges/readers/index.html?highlight=read_gtf#pyranges.readers.read_gtf .", f" The error message was: {str(err)}", ] - ) + ), + exc_info=True, ) # check the validity of gr @@ -677,21 +688,22 @@ def make_splici_txome( # add splice status for introns introns.splice_status = "U" + introns = introns[bed_required_fields] + # get exons exons = gr[gr.Feature == "exon"] exons.Name = exons.transcript_id exons.Gene = exons.gene_id - exons = exons.drop(exons.columns[~exons.columns.isin(introns.columns)].tolist()) exons = exons.sort(["Name", "Start", "End"]) # add splice status for exons exons.splice_status = "S" + # keep only required fields + exons = exons[bed_required_fields] # concat spliced transcripts and introns as splici splici = pr.concat([exons, introns]) - # splici = splici.sort(["Name", "Start", "End", "Gene"]) - # write to files # t2g_3col.tsv splici.df[["Name", "Gene", "splice_status"]].drop_duplicates().to_csv( @@ -762,13 +774,13 @@ def make_splici_txome( if tid2strand[prev_rec.id] == "-": prev_rec = prev_rec.reverse_complement(id=True, description=True) SeqIO.write(prev_rec, out_handle, "fasta") - shutil.rmtree(temp_dir, ignore_errors=True) + # shutil.rmtree(temp_dir, ignore_errors=True) except Exception as err: no_bt = True logging.warning( f" Bedtools failed; Using biopython instead. The error message was: \n{err}" ) - shutil.rmtree(temp_dir, ignore_errors=True) + # shutil.rmtree(temp_dir, ignore_errors=True) if no_bt: with open(out_fa, "w") as out_handle: @@ -867,7 +879,6 @@ def make_spliceu_txome( dedup_seqs=False, no_bt=False, bt_path="bedtools", - write_clean_gtf=False, ): """ Construct the spliceu (spliced + unspliced) transcriptome for alevin-fry. @@ -1023,20 +1034,25 @@ def make_spliceu_txome( # get unspliced unspliced = gr[gr.Feature == "gene"] + # unspliced = gr.boundaries("gene_id") unspliced.Name = unspliced.gene_id + "-I" unspliced.Gene = unspliced.gene_id - # add splice status for unspliced unspliced.splice_status = "U" + # keep only required fields + unspliced = unspliced[bed_required_fields] # get exons exons = gr[gr.Feature == "exon"] exons.Name = exons.transcript_id exons.Gene = exons.gene_id + exons = exons.sort(["Name", "Start", "End"]) # add splice status for exons exons.splice_status = "S" + # keep only required fields + exons = exons[bed_required_fields] # concat spliced transcripts and unspliced as spliceu spliceu = pr.concat([exons, unspliced]) @@ -1052,7 +1068,6 @@ def make_spliceu_txome( # g2g.csv t2g_3col[["Gene", "Gene"]].to_csv(out_g2g, sep="\t", header=False, index=False) - # print(spliceu.head()) tid2strand = dict(zip(spliceu.Name, spliceu.Strand)) # spliceu fasta @@ -1090,7 +1105,7 @@ def make_spliceu_txome( # check return code if bt_r.returncode != 0: - logging.exception("Bedtools failed.") + logging.exception("Bedtools failed.", exc_info=True) # parse temp fasta file to concat exons of each transcript ei_parser = SeqIO.parse(temp_fa, "fasta") @@ -1122,7 +1137,7 @@ def make_spliceu_txome( logging.warning( f" Bedtools failed; Using biopython instead. The error message was: {err}" ) - shutil.rmtree(temp_dir, ignore_errors=True) + # shutil.rmtree(temp_dir, ignore_errors=True) if no_bt: with open(out_fa, "w") as out_handle: