Merge pull request #39 from CCBR/cons-peak-norm

feat: collapse peak p-values in bedtools/merge, reformat with new process
CCBR · Dec 1, 2023 · d1cb7d8 · d1cb7d8
2 parents 0dd8b5e + d0ddb29
commit d1cb7d8
Show file tree

Hide file tree

Showing 34 changed files with 561 additions and 197 deletions.
diff --git a/.gitignore b/.gitignore
@@ -22,3 +22,4 @@ work/
 /.quarto/
 .Rproj.user
 *.Rproj
+.Rbuildignore
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,17 +4,19 @@
 
 - bedops/bedmap (#37)
 - bedtools/map (#37)
-- bedtools/merge (#37)
+- bedtools/merge (#37,#39)
 - bedtools/sort (#37)
 - cat/cat (#37)
 - cat/fastq (#37)
 - custom/combinepeakcounts (#37)
 - custom/consensuspeaks (#37)
-- custom/normalizepeaks (#37)
+- custom/formatmergedbed (#39)
+- custom/normalizepeaks (#37,#39)
+- sort/bed (#39)
 
 ### New subworkflows
 
-- consensus_peaks (#37)
+- consensus_peaks (#37,#39)
 
 ## nf-modules 0.1.0
 

diff --git a/modules/CCBR/bedtools/merge/main.nf b/modules/CCBR/bedtools/merge/main.nf
@@ -6,6 +6,7 @@ process BEDTOOLS_MERGE {
 
     input:
     tuple val(meta), path(bed)
+    val(args)
 
     output:
     tuple val(meta), path('*.bed'), emit: bed
@@ -15,9 +16,7 @@ process BEDTOOLS_MERGE {
     task.ext.when == null || task.ext.when
 
     script:
-    def args = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: "${meta.id}.merged"
-    if ("$bed" == "${prefix}.bed") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+    def prefix = "${bed.baseName}.merged"
     """
     bedtools \\
         merge \\
@@ -32,7 +31,7 @@ process BEDTOOLS_MERGE {
     """
 
     stub:
-    def prefix = task.ext.prefix ?: "${meta.id}.merged"
+    def prefix = "${bed.baseName}.merged"
     """
     touch ${prefix}.bed
 

diff --git a/modules/CCBR/bedtools/merge/meta.yml b/modules/CCBR/bedtools/merge/meta.yml
@@ -21,6 +21,9 @@ input:
       type: file
       description: Input BED file
       pattern: "*.{bed}"
+  - args:
+      type: string
+      description: optional arguments for bedtools merge
 output:
   - meta:
       type: map

diff --git a/modules/CCBR/bedtools/sort/main.nf b/modules/CCBR/bedtools/sort/main.nf
@@ -17,7 +17,7 @@ process BEDTOOLS_SORT {
 
     script:
     def args       = task.ext.args   ?: ''
-    def prefix     = task.ext.prefix ?: "${meta.id}.sorted"
+    def prefix     = task.ext.prefix ?: "${intervals.baseName}.sorted"
     def genome_cmd = genome_file     ?  "-g $genome_file" : ""
     extension      = task.ext.suffix ?: intervals.extension
     if ("$intervals" == "${prefix}.${extension}") {

diff --git a/modules/CCBR/cat/cat/main.nf b/modules/CCBR/cat/cat/main.nf
@@ -17,7 +17,7 @@ process CAT_CAT {
     script:
     def args = task.ext.args ?: ''
     def args2 = task.ext.args2 ?: ''
-    def file_list = files_in.collect { it.toString() }
+    def file_list = files_in.sort({ a, b -> a.baseName <=> b.baseName }).collect{ it.toString() }
 
     // | input     | output     | command1 | command2 |
     // |-----------|------------|----------|----------|

diff --git a/modules/CCBR/custom/combinepeakcounts/main.nf b/modules/CCBR/custom/combinepeakcounts/main.nf
@@ -24,9 +24,10 @@ process CUSTOM_COMBINEPEAKCOUNTS {
     template 'combine_peaks.R'
 
     stub:
-    def prefix = task.ext.prefix ?: "${meta.id}"
+    prefix = task.ext.prefix ?: "${meta.id}"
+    outfile = "${prefix}.consensus.bed"
     """
-    touch ${prefix}.consensus.bed
+    touch ${outfile}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/modules/CCBR/custom/combinepeakcounts/templates/combine_peaks.R b/modules/CCBR/custom/combinepeakcounts/templates/combine_peaks.R
@@ -5,17 +5,19 @@ library(stringr)
 library(readr)
 library(tidyr)
 
-main <- function() {
-  write_lines(get_version(), "versions.yml")
-  dat <- combine_peaks(unlist(str_split("${count_files}", ",")))
-  write_tsv(dat, "${outfile}", col_names = FALSE)
+main <- function(version_file = "versions.yml",
+                 count_files = unlist(str_split("${count_files}", ",")),
+                 out_file = "${outfile}") {
+  write_lines(get_version(), version_file)
+  dat <- combine_peak_counts(count_files)
+  write_tsv(dat, out_file, col_names = FALSE)
 }
 
 # Report the running R version as "major.minor", where the minor component
 # is taken verbatim from R.version (for example "4.3.1").
 get_version <- function() {
   version_parts <- c(R.version[["major"]], R.version[["minor"]])
   return(paste(version_parts, collapse = "."))
 }
 
-combine_peaks <- function(count_files) {
+combine_peak_counts <- function(count_files) {
   count_dat <- count_files %>%
     map(function(file) {
       dat <- read_tsv(file, col_names = FALSE)
@@ -35,12 +37,22 @@ combine_peaks <- function(count_files) {
   return(count_dat)
 }
 
-join_peaks <- function(peakfiles) {
-  return()
-}
-
-normalize_scores <- function(dat) {
-  return()
+read_peaks <- function(peak_file) {
+  peak_colnames <- c(
+    "chrom",
+    "start",
+    "end",
+    "peakID",
+    "score",
+    "strand",
+    "signal",
+    "pvalue",
+    "qvalue",
+    "peak"
+  )
+  peaks <- read_tsv(peak_file, col_names = FALSE)
+  colnames(peaks) <- peak_colnames[seq_len(ncol(peaks))]
+  return(peaks)
 }
 
 main()
diff --git a/modules/CCBR/custom/formatmergedbed/main.nf b/modules/CCBR/custom/formatmergedbed/main.nf
@@ -0,0 +1,30 @@
+// Reformat the output of `bedtools merge` into a consensus peak BED file by
+// running the `format_merged_bed.R` template. Emits the BED file alongside a
+// versions.yml recording the R version.
+process CUSTOM_FORMATMERGEDBED {
+    tag { meta.id }
+    label 'process_medium'
+
+    container 'nciccbr/consensus_peaks:v1.1'
+
+    input:
+    tuple val(meta), path(merged_bed)
+
+    output:
+    tuple val(meta), path("*.bed"), emit: bed
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // `outfile` is read by the R template as the destination path.
+    outfile = "${merged_bed.baseName}.consensus.bed"
+    template 'format_merged_bed.R'
+
+    stub:
+    // Same output name as the real script; the versions entry must carry the
+    // tool name as its key, otherwise the YAML mapping has an empty key.
+    """
+    touch ${merged_bed.baseName}.consensus.bed
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        R: \$(echo \$(R --version | grep 'R version' | sed 's/R version //; s/ (.*//'))
+    END_VERSIONS
+    """
+}
diff --git a/modules/CCBR/custom/formatmergedbed/meta.yml b/modules/CCBR/custom/formatmergedbed/meta.yml
@@ -0,0 +1,50 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
+name: "custom_formatmergedbed"
+description: |
+  Reformat consensus peaks from bedtools merge.
+  Used in the consensus_peaks subworkflow.
+keywords:
+  - chipseq
+  - peaks
+  - consensus
+  - bedtools
+tools:
+  - "R":
+      description: "R is a free software environment for statistical computing and graphics"
+      homepage: "https://www.r-project.org/"
+      licence: ["GPL-3"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'test', single_end:false ]`
+  - merged_bed:
+      type: file
+      description: |
+        Merged output file from calling
+        `bedtools merge -c 1,5,6,7,8,9 -o count,collapse,collapse,collapse,collapse,collapse`
+        on a concatenated & sorted peak file
+      pattern: "*.bed"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'test', single_end:false ]`
+  - bed:
+      type: file
+      description: |
+        A narrow peak bed file with the best p-value for each consensus peak
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+authors:
+  - "@kelly-sovacool"
+maintainers:
+  - "@kelly-sovacool"
diff --git a/modules/CCBR/custom/formatmergedbed/templates/format_merged_bed.R b/modules/CCBR/custom/formatmergedbed/templates/format_merged_bed.R
@@ -0,0 +1,95 @@
+#!/usr/bin/env Rscript
+library(dplyr)
+library(glue)
+library(purrr)
+library(stringr)
+library(readr)
+library(tidyr)
+
+# Entry point: choose one peak per merged interval and write the result.
+#
+# version_file: path the R version string is written to.
+# merged_file:  headerless 9-column TSV (chrom, start, end, counts, followed by
+#               comma-collapsed score, strand, signal, pvalue and qvalue).
+# out_file:     destination of the headerless tab-separated output.
+# n_cores:      number of workers handed to the future plan.
+# min_count:    rows whose counts fall below this value are dropped.
+main <- function(version_file = "versions.yml",
+                 merged_file = "${merged_bed}",
+                 out_file = "${outfile}",
+                 n_cores = as.integer("${task.cpus}"),
+                 min_count = 1) {
+  doFuture::registerDoFuture()
+  future::plan(future::multicore, workers = n_cores)
+  write_version(version_file)
+
+  merged_colnames <- c(
+    "chrom", "start", "end",
+    "counts", "score_cat", "strand_cat",
+    "signal_cat", "pvalue_cat", "qvalue_cat"
+  )
+  merged_dat <- read_tsv(
+    merged_file,
+    col_names = FALSE,
+    col_types = "ciiiccccc"
+  )
+  if (nrow(merged_dat) == 0) {
+    stop("The merged bed file is empty")
+  }
+  colnames(merged_dat) <- merged_colnames
+
+  kept_dat <- filter(merged_dat, counts >= min_count)
+  # Row-wise apply passes each row to select_best_peak as a named vector;
+  # the per-row results are stacked back into a single table.
+  best_peaks <- bind_rows(
+    future.apply::future_apply(kept_dat, 1, select_best_peak)
+  )
+  out_dat <- select(
+    best_peaks,
+    "chrom",
+    "start",
+    "end",
+    "peakID",
+    "score",
+    "strand",
+    "signal",
+    "pvalue",
+    "qvalue"
+  )
+  write_tsv(out_dat, out_file, col_names = FALSE)
+}
+
+# Pick the single best peak for one merged interval.
+#
+# dat_row: one row of the merged table, as a named vector.
+# Returns a one-row tibble holding the constituent peak with the largest
+# pvalue column (presumably a -log10 scale as in narrowPeak files; confirm
+# against the peak caller). Ties are broken by keeping the first such peak so
+# that an interval is never emitted more than once.
+select_best_peak <- function(dat_row) {
+  return(
+    dat_row %>%
+      vec_to_df() %>%
+      pivot_collapsed_columns() %>%
+      slice_max(pvalue, n = 1, with_ties = FALSE)
+  )
+}
+
+# Turn a named vector into a one-row tibble with one column per element.
+vec_to_df <- function(vec) {
+  elements <- as.list(vec)
+  as_tibble(elements)
+}
+
+# Expand the comma-collapsed "_cat" columns of one merged row into one row per
+# constituent peak.
+#
+# dat_row: one-row tibble with chrom, start, end, counts and the "_cat" columns.
+# Returns a tibble with one row per collapsed entry, the interval columns
+# repeated on every row, a peakID of the form chrom:start-end, and numeric
+# score, signal, pvalue and qvalue columns.
+pivot_collapsed_columns <- function(dat_row) {
+  # Transpose so every collapsed column becomes a row with its values in V1.
+  collapsed <- as.data.frame(t(select(dat_row, ends_with("_cat"))))
+  # Split V1 on commas, then reshape so each original peak becomes a row and
+  # each collapsed field (without its "_cat" suffix) becomes a column.
+  per_peak <- collapsed %>%
+    mutate(names = rownames(collapsed)) %>%
+    separate_wider_delim(V1, delim = ",", names_sep = "_") %>%
+    mutate(names = str_replace(names, "_cat", "")) %>%
+    pivot_longer(starts_with("V1")) %>%
+    pivot_wider(names_from = names, values_from = value) %>%
+    select(-name)
+  interval <- select(dat_row, -ends_with("cat"))
+  expanded <- bind_cols(interval, per_peak) %>%
+    mutate(across(c("start", "end", "counts"), as.integer)) %>%
+    mutate(peakID = glue("{chrom}:{start}-{end}")) %>%
+    mutate(across(c("score", "signal", "pvalue", "qvalue"), as.numeric))
+  return(expanded)
+}
+
+# Write the running R version string to version_file.
+write_version <- function(version_file) {
+  r_version <- get_version()
+  write_lines(r_version, version_file)
+}
+
+# Report the running R version as "major.minor", where the minor component
+# is taken verbatim from R.version (for example "4.3.1").
+get_version <- function() {
+  version_parts <- c(R.version[["major"]], R.version[["minor"]])
+  return(paste(version_parts, collapse = "."))
+}
+
+
+# Run the script with the values interpolated into this template.
+main(
+  version_file = "versions.yml",
+  merged_file = "${merged_bed}",
+  out_file = "${outfile}",
+  n_cores = as.integer("${task.cpus}")
+)
diff --git a/modules/CCBR/custom/normalizepeaks/main.nf b/modules/CCBR/custom/normalizepeaks/main.nf
@@ -8,7 +8,7 @@ process CUSTOM_NORMALIZEPEAKS {
     container 'nciccbr/spacesavers2:0.1.1'
 
     input:
-    tuple val(meta), path(count), path(peaks)
+    tuple val(meta), path(peak)
 
     output:
     tuple val(meta), path("*norm.bed"), emit: bed
@@ -19,12 +19,12 @@ process CUSTOM_NORMALIZEPEAKS {
 
     script:
     prefix = task.ext.prefix ?: "${meta.id}"
-    outfile = "${count.baseName}.norm.bed"
+    outfile = "${peak}.norm.bed"
     template 'normalize_peaks.R'
 
     stub:
     """
-    touch ${count.baseName}.norm.bed
+    touch ${peak}.norm.bed
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":