Skip to content

Commit

Permalink
Merge pull request #23 from DongzeHE/develop
Browse files: browse the repository at this point in the history
fix: refine bed file formats
  • Loading branch information
rob-p committed Feb 7, 2023
2 parents c61e805 + 97814a1 commit a783cbe
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 23 deletions.
12 changes: 0 additions & 12 deletions bin/pyroe
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,6 @@ if __name__ == "__main__":
action="store_true",
help="A flag indicates whether flank lengths will be considered when merging introns.",
)
parser_makeSplici.add_argument(
"--write-clean-gtf",
action="store_true",
help="A flag indicates whether a clean gtf will be written if encountered invalid records.",
)

# make-spliceu
parser_makeSpliceu = subparsers.add_parser(
Expand Down Expand Up @@ -159,11 +154,6 @@ if __name__ == "__main__":
action="store_true",
help="A flag indicates whether identical sequences will be deduplicated.",
)
parser_makeSpliceu.add_argument(
"--write-clean-gtf",
action="store_true",
help="A flag indicates whether a clean gtf will be written if encountered invalid records.",
)

# parse available datasets
available_datasets = fetch_processed_quant()
Expand Down Expand Up @@ -274,7 +264,6 @@ if __name__ == "__main__":
no_bt=args.no_bt,
bt_path=args.bt_path,
no_flanking_merge=args.no_flanking_merge,
write_clean_gtf=args.write_clean_gtf,
)
elif args.command == "make-spliceu":
make_spliceu_txome(
Expand All @@ -287,7 +276,6 @@ if __name__ == "__main__":
dedup_seqs=args.dedup_seqs,
no_bt=args.no_bt,
bt_path=args.bt_path,
write_clean_gtf=args.write_clean_gtf,
)
elif args.command == "fetch-quant":
fetch_processed_quant(
Expand Down
37 changes: 26 additions & 11 deletions src/pyroe/make_txome.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,16 @@
from packaging.version import parse as parse_version
import logging

# Columns kept, in this order, when a PyRanges object is narrowed down
# (e.g. ``exons[bed_required_fields]``) before exon and intron/unspliced
# records are concatenated.
# Interval columns locating each record.
_bed_interval_fields = ["Chromosome", "Start", "End", "Strand"]
# Per-record labels: "Name" and "Gene" are filled from the transcript/gene
# ids, "splice_status" is set to "S" or "U" by the txome builders.
_bed_label_fields = ["Name", "Gene", "splice_status"]

bed_required_fields = _bed_interval_fields + _bed_label_fields


def append_extra(extra_infile, out_fa, out_t2g3col, id2name_path, col_status):
"""
Expand Down Expand Up @@ -608,7 +618,8 @@ def make_splici_txome(
" https://pyranges.readthedocs.io/en/latest/autoapi/pyranges/readers/index.html?highlight=read_gtf#pyranges.readers.read_gtf .",
f" The error message was: {str(err)}",
]
)
),
exc_info=True,
)

# check the validity of gr
Expand Down Expand Up @@ -677,21 +688,22 @@ def make_splici_txome(
# add splice status for introns
introns.splice_status = "U"

introns = introns[bed_required_fields]

# get exons
exons = gr[gr.Feature == "exon"]

exons.Name = exons.transcript_id
exons.Gene = exons.gene_id
exons = exons.drop(exons.columns[~exons.columns.isin(introns.columns)].tolist())
exons = exons.sort(["Name", "Start", "End"])
# add splice status for exons
exons.splice_status = "S"
# keep only required fields
exons = exons[bed_required_fields]

# concat spliced transcripts and introns as splici
splici = pr.concat([exons, introns])

# splici = splici.sort(["Name", "Start", "End", "Gene"])

# write to files
# t2g_3col.tsv
splici.df[["Name", "Gene", "splice_status"]].drop_duplicates().to_csv(
Expand Down Expand Up @@ -762,13 +774,13 @@ def make_splici_txome(
if tid2strand[prev_rec.id] == "-":
prev_rec = prev_rec.reverse_complement(id=True, description=True)
SeqIO.write(prev_rec, out_handle, "fasta")
shutil.rmtree(temp_dir, ignore_errors=True)
# shutil.rmtree(temp_dir, ignore_errors=True)
except Exception as err:
no_bt = True
logging.warning(
f" Bedtools failed; Using biopython instead. The error message was: \n{err}"
)
shutil.rmtree(temp_dir, ignore_errors=True)
# shutil.rmtree(temp_dir, ignore_errors=True)

if no_bt:
with open(out_fa, "w") as out_handle:
Expand Down Expand Up @@ -867,7 +879,6 @@ def make_spliceu_txome(
dedup_seqs=False,
no_bt=False,
bt_path="bedtools",
write_clean_gtf=False,
):
"""
Construct the spliceu (spliced + unspliced) transcriptome for alevin-fry.
Expand Down Expand Up @@ -1023,20 +1034,25 @@ def make_spliceu_txome(

# get unspliced
unspliced = gr[gr.Feature == "gene"]
# unspliced = gr.boundaries("gene_id")
unspliced.Name = unspliced.gene_id + "-I"
unspliced.Gene = unspliced.gene_id

# add splice status for unspliced
unspliced.splice_status = "U"
# keep only required fields
unspliced = unspliced[bed_required_fields]

# get exons
exons = gr[gr.Feature == "exon"]

exons.Name = exons.transcript_id
exons.Gene = exons.gene_id

exons = exons.sort(["Name", "Start", "End"])
# add splice status for exons
exons.splice_status = "S"
# keep only required fields
exons = exons[bed_required_fields]

# concat spliced transcripts and unspliced as spliceu
spliceu = pr.concat([exons, unspliced])
Expand All @@ -1052,7 +1068,6 @@ def make_spliceu_txome(
# g2g.csv
t2g_3col[["Gene", "Gene"]].to_csv(out_g2g, sep="\t", header=False, index=False)

# print(spliceu.head())
tid2strand = dict(zip(spliceu.Name, spliceu.Strand))

# spliceu fasta
Expand Down Expand Up @@ -1090,7 +1105,7 @@ def make_spliceu_txome(

# check return code
if bt_r.returncode != 0:
logging.exception("Bedtools failed.")
logging.exception("Bedtools failed.", exc_info=True)

# parse temp fasta file to concat exons of each transcript
ei_parser = SeqIO.parse(temp_fa, "fasta")
Expand Down Expand Up @@ -1122,7 +1137,7 @@ def make_spliceu_txome(
logging.warning(
f" Bedtools failed; Using biopython instead. The error message was: {err}"
)
shutil.rmtree(temp_dir, ignore_errors=True)
# shutil.rmtree(temp_dir, ignore_errors=True)

if no_bt:
with open(out_fa, "w") as out_handle:
Expand Down

0 comments on commit a783cbe

Please sign in to comment.