Merge pull request #23 from DongzeHE/develop

fix: refine bed file formats
COMBINE-lab · Feb 7, 2023 · a783cbe
2 parents c61e805 + 97814a1
commit a783cbe

Showing 2 changed files with 26 additions and 23 deletions.
diff --git a/bin/pyroe b/bin/pyroe
@@ -100,11 +100,6 @@ if __name__ == "__main__":
         action="store_true",
         help="A flag indicates whether flank lengths will be considered when merging introns.",
     )
-    parser_makeSplici.add_argument(
-        "--write-clean-gtf",
-        action="store_true",
-        help="A flag indicates whether a clean gtf will be written if encountered invalid records.",
-    )
 
     # make-spliceu
     parser_makeSpliceu = subparsers.add_parser(
@@ -159,11 +154,6 @@ if __name__ == "__main__":
         action="store_true",
         help="A flag indicates whether identical sequences will be deduplicated.",
     )
-    parser_makeSpliceu.add_argument(
-        "--write-clean-gtf",
-        action="store_true",
-        help="A flag indicates whether a clean gtf will be written if encountered invalid records.",
-    )
 
     # parse available datasets
     available_datasets = fetch_processed_quant()
@@ -274,7 +264,6 @@ if __name__ == "__main__":
             no_bt=args.no_bt,
             bt_path=args.bt_path,
             no_flanking_merge=args.no_flanking_merge,
-            write_clean_gtf=args.write_clean_gtf,
         )
     elif args.command == "make-spliceu":
         make_spliceu_txome(
@@ -287,7 +276,6 @@ if __name__ == "__main__":
             dedup_seqs=args.dedup_seqs,
             no_bt=args.no_bt,
             bt_path=args.bt_path,
-            write_clean_gtf=args.write_clean_gtf,
         )
     elif args.command == "fetch-quant":
         fetch_processed_quant(

diff --git a/src/pyroe/make_txome.py b/src/pyroe/make_txome.py
@@ -12,6 +12,16 @@
 from packaging.version import parse as parse_version
 import logging
 
+bed_required_fields = [
+    "Chromosome",
+    "Start",
+    "End",
+    "Strand",
+    "Name",
+    "Gene",
+    "splice_status",
+]
+
 
 def append_extra(extra_infile, out_fa, out_t2g3col, id2name_path, col_status):
     """
@@ -608,7 +618,8 @@ def make_splici_txome(
                     " https://pyranges.readthedocs.io/en/latest/autoapi/pyranges/readers/index.html?highlight=read_gtf#pyranges.readers.read_gtf .",
                     f" The error message was: {str(err)}",
                 ]
-            )
+            ),
+            exc_info=True,
         )
 
     # check the validity of gr
@@ -677,21 +688,22 @@ def make_splici_txome(
     # add splice status for introns
     introns.splice_status = "U"
 
+    introns = introns[bed_required_fields]
+
     # get exons
     exons = gr[gr.Feature == "exon"]
 
     exons.Name = exons.transcript_id
     exons.Gene = exons.gene_id
-    exons = exons.drop(exons.columns[~exons.columns.isin(introns.columns)].tolist())
     exons = exons.sort(["Name", "Start", "End"])
     # add splice status for exons
     exons.splice_status = "S"
+    # keep only required fields
+    exons = exons[bed_required_fields]
 
     # concat spliced transcripts and introns as splici
     splici = pr.concat([exons, introns])
 
-    # splici = splici.sort(["Name", "Start", "End", "Gene"])
-
     # write to files
     # t2g_3col.tsv
     splici.df[["Name", "Gene", "splice_status"]].drop_duplicates().to_csv(
@@ -762,13 +774,13 @@ def make_splici_txome(
                 if tid2strand[prev_rec.id] == "-":
                     prev_rec = prev_rec.reverse_complement(id=True, description=True)
                 SeqIO.write(prev_rec, out_handle, "fasta")
-            shutil.rmtree(temp_dir, ignore_errors=True)
+            # shutil.rmtree(temp_dir, ignore_errors=True)
         except Exception as err:
             no_bt = True
             logging.warning(
                 f" Bedtools failed; Using biopython instead. The error message was: \n{err}"
             )
-            shutil.rmtree(temp_dir, ignore_errors=True)
+            # shutil.rmtree(temp_dir, ignore_errors=True)
 
     if no_bt:
         with open(out_fa, "w") as out_handle:
@@ -867,7 +879,6 @@ def make_spliceu_txome(
     dedup_seqs=False,
     no_bt=False,
     bt_path="bedtools",
-    write_clean_gtf=False,
 ):
     """
     Construct the spliceu (spliced + unspliced) transcriptome for alevin-fry.
@@ -1023,20 +1034,25 @@ def make_spliceu_txome(
 
     # get unspliced
     unspliced = gr[gr.Feature == "gene"]
+    # unspliced = gr.boundaries("gene_id")
     unspliced.Name = unspliced.gene_id + "-I"
     unspliced.Gene = unspliced.gene_id
-
     # add splice status for unspliced
     unspliced.splice_status = "U"
+    # keep only required fields
+    unspliced = unspliced[bed_required_fields]
 
     # get exons
     exons = gr[gr.Feature == "exon"]
 
     exons.Name = exons.transcript_id
     exons.Gene = exons.gene_id
+
     exons = exons.sort(["Name", "Start", "End"])
     # add splice status for exons
     exons.splice_status = "S"
+    # keep only required fields
+    exons = exons[bed_required_fields]
 
     # concat spliced transcripts and unspliced as spliceu
     spliceu = pr.concat([exons, unspliced])
@@ -1052,7 +1068,6 @@ def make_spliceu_txome(
     # g2g.csv
     t2g_3col[["Gene", "Gene"]].to_csv(out_g2g, sep="\t", header=False, index=False)
 
-    # print(spliceu.head())
     tid2strand = dict(zip(spliceu.Name, spliceu.Strand))
 
     # spliceu fasta
@@ -1090,7 +1105,7 @@ def make_spliceu_txome(
 
             # check return code
             if bt_r.returncode != 0:
-                logging.exception("Bedtools failed.")
+                logging.exception("Bedtools failed.", exc_info=True)
 
             # parse temp fasta file to concat exons of each transcript
             ei_parser = SeqIO.parse(temp_fa, "fasta")
@@ -1122,7 +1137,7 @@ def make_spliceu_txome(
             logging.warning(
                 f" Bedtools failed; Using biopython instead. The error message was: {err}"
             )
-            shutil.rmtree(temp_dir, ignore_errors=True)
+            # shutil.rmtree(temp_dir, ignore_errors=True)
 
     if no_bt:
         with open(out_fa, "w") as out_handle: