Genentech · mkabza · Jan 25, 2024 · Jan 24, 2024 · Jan 24, 2024 · Jan 24, 2024
diff --git a/R/assign_intron_strand.R b/R/assign_intron_strand.R
@@ -8,6 +8,8 @@
 #' @param genome_fasta_file A string containing a genome FASTA file path.
 #' @param min_intron_length An integer scalar specifying the minimal length
 #' of introns to assign strand to.
+#' @param max_intron_length An integer scalar specifying the maximum length
+#' of introns to assign strand to.
 #' @param known_intron_motifs A character vector specifying the known intron
 #' motifs.
 #' @param rescue_annotated_introns A logical scalar specifying if introns
@@ -20,6 +22,7 @@ assign_intron_strand <- function(intron_granges,
                                  anno_data,
                                  genome_fasta_file,
                                  min_intron_length = 30,
+                                 max_intron_length = 5e6,
                                  known_intron_motifs = c("GT-AG"),
                                  rescue_annotated_introns = FALSE) {
 
@@ -31,6 +34,7 @@ assign_intron_strand <- function(intron_granges,
     assertthat::assert_that(assertthat::is.string(genome_fasta_file))
     assertthat::assert_that(file.exists(genome_fasta_file))
     assertthat::assert_that(assertthat::is.count(min_intron_length))
+    assertthat::assert_that(assertthat::is.count(max_intron_length))
     assertthat::assert_that(is.character(known_intron_motifs))
     assertthat::assert_that(assertthat::is.flag(rescue_annotated_introns))
 
@@ -58,7 +62,9 @@ assign_intron_strand <- function(intron_granges,
     intron_seq_plus <- BSgenome::getSeq(genome_seq, intron_granges_plus)
     intron_lenght_plus <- BiocGenerics::width(intron_seq_plus)
     intron_seq_plus[intron_lenght_plus < min_intron_length] <-
-        strrep("N", min_intron_length)
+        strrep("N", 4)
+    intron_seq_plus[intron_lenght_plus > max_intron_length] <-
+        strrep("N", 4)
     intron_motif_plus <- glue::glue(
         "{XVector::subseq(intron_seq_plus, start = 1, width = 2)}", "-",
         "{XVector::subseq(intron_seq_plus, end = BiocGenerics::width(intron_seq_plus), width = 2)}"
@@ -70,7 +76,9 @@ assign_intron_strand <- function(intron_granges,
     intron_seq_minus <- BSgenome::getSeq(genome_seq, intron_granges_minus)
     intron_lenght_minus <- BiocGenerics::width(intron_seq_minus)
     intron_seq_minus[intron_lenght_minus < min_intron_length] <-
-        strrep("N", min_intron_length)
+        strrep("N", 4)
+    intron_seq_minus[intron_lenght_minus > max_intron_length] <-
+        strrep("N", 4)
     intron_motif_minus <- glue::glue(
         "{XVector::subseq(intron_seq_minus, start = 1, width = 2)}", "-",
         "{XVector::subseq(intron_seq_minus, end = BiocGenerics::width(intron_seq_minus), width = 2)}"

diff --git a/R/bam_to_read_structures.R b/R/bam_to_read_structures.R
@@ -39,7 +39,12 @@ bam_to_read_structures <- function(bam_files,
         bam_file_con <- Rsamtools::BamFile(bam_file, yieldSize = chunk_size)
         bam_param <- Rsamtools::ScanBamParam(
             what = "qname",
-            flag = Rsamtools::scanBamFlag(isSupplementaryAlignment = FALSE)
+            flag = Rsamtools::scanBamFlag(
+                isSupplementaryAlignment = FALSE,
+                isSecondaryAlignment = FALSE,
+                isNotPassingQualityControls = FALSE,
+                isDuplicate = FALSE
+            )
         )
         open(bam_file_con)
         repeat {
@@ -96,6 +101,17 @@ bam_to_read_structures <- function(bam_files,
     }, BPPARAM = BPPARAM)
     read_summary <- do.call(rbind, read_summary)
 
+    # Remove inter-chromosomal alignments; vapply() (unlike sapply()) keeps
+    # the filter a logical vector even when there are no reads to check
+    is_same_seq_name <- vapply(
+        strsplit(read_summary$intron_positions, ","),
+        function(intron_positions) {
+            seq_names <- vapply(strsplit(intron_positions, ":"), "[",
+                                character(1), 1)
+            length(unique(seq_names)) == 1
+        }, logical(1))
+    read_summary <- read_summary[is_same_seq_name, ]
+
     # Merge read structures from different BAM files
     read_summary <- read_summary %>%
         dplyr::group_by(.data$intron_positions) %>%

diff --git a/R/bam_to_tcc.R b/R/bam_to_tcc.R
@@ -156,7 +156,12 @@ bam_to_tcc <- function(bam_files,
         }
         bam_param <- Rsamtools::ScanBamParam(
             what = "qname",
-            flag = Rsamtools::scanBamFlag(isSupplementaryAlignment = FALSE),
+            flag = Rsamtools::scanBamFlag(
+                isSupplementaryAlignment = FALSE,
+                isSecondaryAlignment = FALSE,
+                isNotPassingQualityControls = FALSE,
+                isDuplicate = FALSE
+            ),
             tag = bam_tags
         )
         open(bam_file_con)

diff --git a/R/prepare_bam_transcripts.R b/R/prepare_bam_transcripts.R
@@ -9,6 +9,8 @@
 #' @param genome_fasta_file A string containing a genome FASTA file path.
 #' @param min_intron_length An integer scalar specifying the minimal length
 #' of introns to assign strand to.
+#' @param max_intron_length An integer scalar specifying the maximum length
+#' of introns to assign strand to.
 #' @param known_intron_motifs A character vector specifying the known intron
 #' motifs.
 #' @param rescue_annotated_introns A logical scalar specifying if introns
@@ -34,6 +36,7 @@ prepare_bam_transcripts <- function(bam_parsed,
                                     anno_data,
                                     genome_fasta_file,
                                     min_intron_length = 30,
+                                    max_intron_length = 5e6,
                                     known_intron_motifs = c("GT-AG"),
                                     rescue_annotated_introns = FALSE,
                                     known_intron_granges = NULL,
@@ -42,6 +45,11 @@ prepare_bam_transcripts <- function(bam_parsed,
 
     # Check arguments
     assertthat::assert_that(is.data.frame(bam_parsed))
+    assertthat::assert_that(
+        length(bam_parsed$intron_positions) ==
+            length(unique(bam_parsed$intron_positions)),
+        msg = "bam_parsed$intron_positions contains non-unique values"
+    )
     assertthat::assert_that(is.list(anno_data))
     assertthat::assert_that(assertthat::has_name(anno_data, "gene_df"))
     assertthat::assert_that(is.data.frame(anno_data$gene_df))
@@ -54,6 +62,7 @@ prepare_bam_transcripts <- function(bam_parsed,
     assertthat::assert_that(assertthat::is.string(genome_fasta_file))
     assertthat::assert_that(file.exists(genome_fasta_file))
     assertthat::assert_that(assertthat::is.count(min_intron_length))
+    assertthat::assert_that(assertthat::is.count(max_intron_length))
     assertthat::assert_that(is.character(known_intron_motifs))
     assertthat::assert_that(assertthat::is.flag(rescue_annotated_introns))
     if (!is.null(known_intron_granges)) {
@@ -83,6 +92,7 @@ prepare_bam_transcripts <- function(bam_parsed,
                                        anno_data = anno_data,
                                        genome_fasta_file = genome_fasta_file,
                                        min_intron_length = min_intron_length,
+                                       max_intron_length = max_intron_length,
                                        known_intron_motifs = known_intron_motifs,
                                        rescue_annotated_introns = rescue_annotated_introns)
     nr_intron_positions <- intron_data$nr_intron_positions

diff --git a/R/prepare_transcripts.R b/R/prepare_transcripts.R
@@ -10,6 +10,8 @@
 #' only reference transcripts are used.
 #' @param min_intron_length An integer scalar specifying the minimal length
 #' of introns to assign strand to.
+#' @param max_intron_length An integer scalar specifying the maximum length
+#' of introns to assign strand to.
 #' @param known_intron_motifs A character vector specifying the known intron
 #' motifs.
 #' @param rescue_annotated_introns A logical scalar specifying if introns
@@ -37,6 +39,7 @@ prepare_transcripts <- function(gtf_file,
                                 genome_fasta_file,
                                 bam_parsed,
                                 min_intron_length = 30,
+                                max_intron_length = 5e6,
                                 known_intron_motifs = c("GT-AG"),
                                 rescue_annotated_introns = FALSE,
                                 known_intron_granges = NULL,
@@ -51,8 +54,14 @@ prepare_transcripts <- function(gtf_file,
     assertthat::assert_that(file.exists(genome_fasta_file))
     if (!is.null(bam_parsed)) {
         assertthat::assert_that(is.data.frame(bam_parsed))
+        assertthat::assert_that(
+            length(bam_parsed$intron_positions) ==
+                length(unique(bam_parsed$intron_positions)),
+            msg = "bam_parsed$intron_positions contains non-unique values"
+        )
     }
     assertthat::assert_that(assertthat::is.count(min_intron_length))
+    assertthat::assert_that(assertthat::is.count(max_intron_length))
     assertthat::assert_that(is.character(known_intron_motifs))
     assertthat::assert_that(assertthat::is.flag(rescue_annotated_introns))
     if (!is.null(known_intron_granges)) {
@@ -96,7 +105,9 @@ prepare_transcripts <- function(gtf_file,
     if (!is.null(bam_parsed)) {
         tx_list_bam <- prepare_bam_transcripts(
             bam_parsed = bam_parsed, anno_data = anno_data,
-            genome_fasta_file = genome_fasta_file, min_intron_length = min_intron_length,
+            genome_fasta_file = genome_fasta_file,
+            min_intron_length = min_intron_length,
+            max_intron_length = max_intron_length,
             known_intron_motifs = known_intron_motifs,
             rescue_annotated_introns = rescue_annotated_introns,
             known_intron_granges = known_intron_granges,

diff --git a/R/process_intron_data.R b/R/process_intron_data.R
@@ -5,6 +5,7 @@ process_intron_data <- function(bam_parsed,
                                 anno_data,
                                 genome_fasta_file,
                                 min_intron_length = 30,
+                                max_intron_length = 5e6,
                                 known_intron_motifs = c("GT-AG"),
                                 rescue_annotated_introns = FALSE) {
 
@@ -18,6 +19,7 @@ process_intron_data <- function(bam_parsed,
     assertthat::assert_that(assertthat::is.string(genome_fasta_file))
     assertthat::assert_that(file.exists(genome_fasta_file))
     assertthat::assert_that(assertthat::is.count(min_intron_length))
+    assertthat::assert_that(assertthat::is.count(max_intron_length))
     assertthat::assert_that(is.character(known_intron_motifs))
     assertthat::assert_that(assertthat::is.flag(rescue_annotated_introns))
 
@@ -37,6 +39,7 @@ process_intron_data <- function(bam_parsed,
                                               anno_data = anno_data,
                                               genome_fasta_file = genome_fasta_file,
                                               min_intron_length = min_intron_length,
+                                              max_intron_length = max_intron_length,
                                               known_intron_motifs = known_intron_motifs,
                                               rescue_annotated_introns = rescue_annotated_introns)
     nr_intron_positions <- as.character(nr_intron_granges)

diff --git a/docs/Isosceles.html b/docs/Isosceles.html
diff --git a/docs/Isosceles.pdf b/docs/Isosceles.pdf
diff --git a/docs/Mouse_E18_brain_analysis.html b/docs/Mouse_E18_brain_analysis.html