From 1df35bdfd7b0f9a39384e3ed7632e6703b01e229 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Thu, 12 Sep 2024 10:45:51 -0500 Subject: [PATCH 01/54] style: Run styler --- bin/parameter_tuning.R | 154 +++++++++++++------------- bin/transcriptcalling_grohmm.R | 192 ++++++++++++++++----------------- 2 files changed, 173 insertions(+), 173 deletions(-) diff --git a/bin/parameter_tuning.R b/bin/parameter_tuning.R index 97bb9000..251cab70 100755 --- a/bin/parameter_tuning.R +++ b/bin/parameter_tuning.R @@ -8,76 +8,76 @@ suppressPackageStartupMessages(library(groHMM)) parser <- ArgumentParser(description = "Run groHMM on some bam files") parser$add_argument( - "-i", - "--bam_files", - type = "character", - nargs = "+", - metavar = "path", - help = "GRO SEQ data in bam files.", - required = TRUE + "-i", + "--bam_files", + type = "character", + nargs = "+", + metavar = "path", + help = "GRO SEQ data in bam files.", + required = TRUE ) parser$add_argument( - "-t", - "--tuning_file", - type = "character", - default = NULL, - metavar = "path", - help = "File with tuning parameters and error rates." + "-t", + "--tuning_file", + type = "character", + default = NULL, + metavar = "path", + help = "File with tuning parameters and error rates." ) parser$add_argument( - "-o", - "--outdir", - type = "character", - default = "./", - metavar = "path", - help = "Output directory." + "-o", + "--outdir", + type = "character", + default = "./", + metavar = "path", + help = "Output directory." 
) parser$add_argument( - "-l", - "--ltprobb", - type = "integer", - default = -200, - metavar = "integer", - help = cat( - "Log-transformed transition probability of switching from transcribed + "-l", + "--ltprobb", + type = "integer", + default = -200, + metavar = "integer", + help = cat( + "Log-transformed transition probability of switching from transcribed state to non-transcribed state" - ) + ) ) parser$add_argument( - "-u", - "--uts", - type = "integer", - default = 5, - metavar = "integer", - help = cat( - "Variance of the emission probability for reads in the + "-u", + "--uts", + type = "integer", + default = 5, + metavar = "integer", + help = cat( + "Variance of the emission probability for reads in the non-transcribed state, respectively." - ) + ) ) parser$add_argument( - "-p", - "--outprefix", - type = "character", - default = "grohmm", - metavar = "string", - help = "Output prefix." + "-p", + "--outprefix", + type = "character", + default = "grohmm", + metavar = "string", + help = "Output prefix." ) parser$add_argument( - "-g", - "--gtf", - type = "character", - default = NULL, - metavar = "string", - help = "GTF File to create TxDb", - required = TRUE + "-g", + "--gtf", + type = "character", + default = NULL, + metavar = "string", + help = "GTF File to create TxDb", + required = TRUE ) parser$add_argument( - "-c", - "--cores", - type = "integer", - default = 1, - metavar = "integer", - help = "Number of cores." + "-c", + "--cores", + type = "integer", + default = 1, + metavar = "integer", + help = "Number of cores." ) args <- parser$parse_args() @@ -85,29 +85,29 @@ args <- parser$parse_args() setwd(args$outdir) if (is.null(args$bam_files)) { - print_help(args) - stop("Please provide a bam file", call. = FALSE) + print_help(args) + stop("Please provide a bam file", call. = FALSE) } if (is.null(args$tuning_file)) { - print_help(args) - stop("Please provide a tuning file", call. = FALSE) + print_help(args) + stop("Please provide a tuning file", call. 
= FALSE) } # Read in bam file. if (file.exists(args$outdir) == FALSE) { - dir.create(args$outdir, recursive = TRUE) + dir.create(args$outdir, recursive = TRUE) } setwd(args$outdir) # CHANGE BASED ON PAIRED OR SINGLE END alignments <- c() for (bam in args$bam_files) { - alignments <- append( - alignments, - as(readGAlignments(bam), "GRanges") - ) - alignments <- keepStandardChromosomes(alignments, pruning.mode = "coarse") + alignments <- append( + alignments, + as(readGAlignments(bam), "GRanges") + ) + alignments <- keepStandardChromosomes(alignments, pruning.mode = "coarse") } print("Input transcript annotations") @@ -115,9 +115,9 @@ kg_db <- makeTxDbFromGFF(args$gtf) kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name")) print("Collapse annotations in preparation for overlap") kg_consensus <- makeConsensusAnnotations( - kg_tx, - keytype = "gene_id", - mc.cores = args$cores + kg_tx, + keytype = "gene_id", + mc.cores = args$cores ) print("Finished consensus annotations") @@ -125,12 +125,12 @@ print("Finished consensus annotations") tune <- read.csv(args$tuning_file) evals <- mclapply(seq_len(nrow(tune)), function(x) { - hmm <- detectTranscripts( - reads = alignments, - LtProbB = tune$LtProbB[x], UTS = tune$UTS[x] - ) - e <- evaluateHMMInAnnotations(hmm$transcripts, kg_consensus) - e$eval + hmm <- detectTranscripts( + reads = alignments, + LtProbB = tune$LtProbB[x], UTS = tune$UTS[x] + ) + e <- evaluateHMMInAnnotations(hmm$transcripts, kg_consensus) + e$eval }, mc.cores = args$cores, mc.silent = TRUE) tune <- cbind(tune, do.call(rbind, evals)) @@ -149,10 +149,10 @@ citation("AnnotationDbi") r_log_file <- "R_sessionInfo.log" if (file.exists(r_log_file) == FALSE) { - sink(r_log_file) - a <- sessionInfo() - print(a) - sink() + sink(r_log_file) + a <- sessionInfo() + print(a) + sink() } ################################################################################ diff --git a/bin/transcriptcalling_grohmm.R b/bin/transcriptcalling_grohmm.R index 
bdd4c7e7..3c99c8bd 100755 --- a/bin/transcriptcalling_grohmm.R +++ b/bin/transcriptcalling_grohmm.R @@ -8,76 +8,76 @@ suppressPackageStartupMessages(library(groHMM)) parser <- ArgumentParser(description = "Run groHMM on some bam files") parser$add_argument( - "-i", - "--bam_files", - type = "character", - nargs = "+", - metavar = "path", - help = "GRO SEQ data in bam files.", - required = TRUE + "-i", + "--bam_files", + type = "character", + nargs = "+", + metavar = "path", + help = "GRO SEQ data in bam files.", + required = TRUE ) parser$add_argument( - "-t", - "--tuning_file", - type = "character", - default = NULL, - metavar = "path", - help = "File with tuning parameters and error rates." + "-t", + "--tuning_file", + type = "character", + default = NULL, + metavar = "path", + help = "File with tuning parameters and error rates." ) parser$add_argument( - "-o", - "--outdir", - type = "character", - default = "./", - metavar = "path", - help = "Output directory." + "-o", + "--outdir", + type = "character", + default = "./", + metavar = "path", + help = "Output directory." ) parser$add_argument( - "-l", - "--ltprobb", - type = "integer", - default = -200, - metavar = "integer", - help = cat( - "Log-transformed transition probability of switching from transcribed + "-l", + "--ltprobb", + type = "integer", + default = -200, + metavar = "integer", + help = cat( + "Log-transformed transition probability of switching from transcribed state to non-transcribed state" - ) + ) ) parser$add_argument( - "-u", - "--uts", - type = "integer", - default = 5, - metavar = "integer", - help = cat( - "Variance of the emission probability for reads in the + "-u", + "--uts", + type = "integer", + default = 5, + metavar = "integer", + help = cat( + "Variance of the emission probability for reads in the non-transcribed state, respectively." - ) + ) ) parser$add_argument( - "-p", - "--outprefix", - type = "character", - default = "grohmm", - metavar = "string", - help = "Output prefix." 
+ "-p", + "--outprefix", + type = "character", + default = "grohmm", + metavar = "string", + help = "Output prefix." ) parser$add_argument( - "-g", - "--gtf", - type = "character", - default = NULL, - metavar = "string", - help = "GTF File to create TxDb", - required = TRUE + "-g", + "--gtf", + type = "character", + default = NULL, + metavar = "string", + help = "GTF File to create TxDb", + required = TRUE ) parser$add_argument( - "-c", - "--cores", - type = "integer", - default = 1, - metavar = "integer", - help = "Number of cores." + "-c", + "--cores", + type = "integer", + default = 1, + metavar = "integer", + help = "Number of cores." ) args <- parser$parse_args() @@ -87,41 +87,41 @@ setwd(args$outdir) # Load alignment files alignments <- c() for (bam in args$bam_files) { - alignments <- append( - alignments, - as(readGAlignments(bam), "GRanges") - ) - alignments <- keepStandardChromosomes(alignments, pruning.mode = "coarse") + alignments <- append( + alignments, + as(readGAlignments(bam), "GRanges") + ) + alignments <- keepStandardChromosomes(alignments, pruning.mode = "coarse") } # Call annotations > DEFAULT VALUES ASSIGNED if (is.null(args$tuning_file)) { - hmm_result <- detectTranscripts( - alignments, - LtProbB = args$ltprobb, - UTS = args$uts, - threshold = 1 - ) # Uses either inputted or default values + hmm_result <- detectTranscripts( + alignments, + LtProbB = args$ltprobb, + UTS = args$uts, + threshold = 1 + ) # Uses either inputted or default values } else { - tune <- read.csv(args$tuning_file) - # Minimum error - uts <- tune[which.min(tune$errorRate), "UTS"] - lt_probb <- tune[which.min(tune$errorRate), "LtProbB"] - hmm_result <- detectTranscripts( - alignments, - LtProbB = lt_probb, - UTS = uts, - threshold = 1 - ) + tune <- read.csv(args$tuning_file) + # Minimum error + uts <- tune[which.min(tune$errorRate), "UTS"] + lt_probb <- tune[which.min(tune$errorRate), "LtProbB"] + hmm_result <- detectTranscripts( + alignments, + LtProbB = lt_probb, + UTS 
= uts, + threshold = 1 + ) } tx_hmm <- hmm_result$transcripts write.table( - tx_hmm, - file = paste(args$outprefix, - ".transcripts.txt", - sep = "" - ) + tx_hmm, + file = paste(args$outprefix, + ".transcripts.txt", + sep = "" + ) ) print("Input transcript annotations") @@ -129,9 +129,9 @@ kg_db <- makeTxDbFromGFF(args$gtf) kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name")) print("Collapse annotations in preparation for overlap") kg_consensus <- makeConsensusAnnotations( - kg_tx, - keytype = "gene_id", - mc.cores = args$cores + kg_tx, + keytype = "gene_id", + mc.cores = args$cores ) print("Finished consensus annotations") @@ -144,15 +144,15 @@ capture.output(e$eval, file = paste0(args$outprefix, ".eval.txt")) # repairing with annotations get_expressed_annotations <- function(features, reads) { - f_limit <- limitToXkb(features) - count <- countOverlaps(f_limit, reads) - features <- features[count != 0, ] - return(features[(quantile(width(features), .05) < width(features)) & - (width(features) < quantile(width(features), .95)), ]) + f_limit <- limitToXkb(features) + count <- countOverlaps(f_limit, reads) + features <- features[count != 0, ] + return(features[(quantile(width(features), .05) < width(features)) & + (width(features) < quantile(width(features), .95)), ]) } con_expressed <- get_expressed_annotations( - features = kg_consensus, - reads = alignments + features = kg_consensus, + reads = alignments ) b_plus <- breakTranscriptsOnGenes(tx_hmm, kg_consensus, strand = "+") b_minus <- breakTranscriptsOnGenes(tx_hmm, kg_consensus, strand = "-") @@ -160,8 +160,8 @@ tx_broken <- c(b_plus, b_minus) tx_final <- combineTranscripts(tx_broken, kg_consensus) td_final <- getTxDensity(tx_final, con_expressed, mc.cores = args$cores) export( - tx_final, - con = paste(args$outprefix, ".final.transcripts.bed", sep = "") + tx_final, + con = paste(args$outprefix, ".final.transcripts.bed", sep = "") ) capture.output(td_final, file = paste0(args$outprefix, 
".tdFinal.txt")) # Output plot @@ -185,10 +185,10 @@ citation("AnnotationDbi") r_log_file <- "R_sessionInfo.log" if (file.exists(r_log_file) == FALSE) { - sink(r_log_file) - a <- sessionInfo() - print(a) - sink() + sink(r_log_file) + a <- sessionInfo() + print(a) + sink() } ################################################################################ From f7ddbb625425a25648f2687043f84b495c83f820 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Fri, 13 Sep 2024 11:06:33 -0500 Subject: [PATCH 02/54] test: Write grohmm tests --- .../grohmm/skip_tuning/main.nf.test | 49 +++++++++++++++++++ .../grohmm/tuning/main.nf.test | 48 ++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 workflows/tests/transcript_indentification/grohmm/skip_tuning/main.nf.test create mode 100644 workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test diff --git a/workflows/tests/transcript_indentification/grohmm/skip_tuning/main.nf.test b/workflows/tests/transcript_indentification/grohmm/skip_tuning/main.nf.test new file mode 100644 index 00000000..b9d975bd --- /dev/null +++ b/workflows/tests/transcript_indentification/grohmm/skip_tuning/main.nf.test @@ -0,0 +1,49 @@ +nextflow_pipeline { + + name "groHMM" + script "../../../main.nf" + tag "groHMM" + + test("Should be able to skip tuning") { + + when { + params { + outdir = "$outputDir" + skip_grohmm = false + skip_tuning = true + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(UTILS.removeNextflowVersion("$outputDir/pipeline_info/nf_core_pipeline_software_mqc_versions.yml")).match("software_versions") }, + { assert snapshot( + workflow.trace.tasks().size(), + path("$outputDir/transcript_identification/homer/cd4.bed"), + path("$outputDir/transcript_identification/homer/jurkat.bed"), + // FIXME Not determinstic because of the order of files + // Add to the other tests when fixed + // UTILS.getAllFilesFromDir("$outputDir/transcript_identification/pints/", ".bed"), + 
path("$outputDir/transcript_identification/intersect/").list(), + path("$outputDir/transcript_identification/filtered/").list(), + path("$outputDir/transcript_identification/grohmm/cd4.eval.txt"), + path("$outputDir/transcript_identification/grohmm/cd4.final.transcripts.bed"), + path("$outputDir/transcript_identification/grohmm/cd4.tdFinal.txt"), + path("$outputDir/transcript_identification/grohmm/cd4.tdplot_mqc.jpg").exists(), + path("$outputDir/transcript_identification/grohmm/cd4.transcripts.txt"), + path("$outputDir/transcript_identification/grohmm/jurkat.eval.txt"), + path("$outputDir/transcript_identification/grohmm/jurkat.final.transcripts.bed"), + path("$outputDir/transcript_identification/grohmm/jurkat.tdFinal.txt"), + path("$outputDir/transcript_identification/grohmm/jurkat.tdplot_mqc.jpg").exists(), + path("$outputDir/transcript_identification/grohmm/jurkat.transcripts.txt"), + // FIXME Not determinstic because of the order of files + // Add to the other tests when fixed + // path("$outputDir/quantification/").list(), + path("$outputDir/multiqc/multiqc_report.html").exists(), + ).match("output_files") + } + ) + } + } +} diff --git a/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test b/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test new file mode 100644 index 00000000..09053601 --- /dev/null +++ b/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test @@ -0,0 +1,48 @@ +nextflow_pipeline { + name "groHMM" + script "../../../main.nf" + tag "groHMM" + + test("Should run with defaults") { + when { + params { + outdir = "$outputDir" + skip_grohmm = false + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(UTILS.removeNextflowVersion("$outputDir/pipeline_info/nf_core_pipeline_software_mqc_versions.yml")).match("software_versions") }, + { assert snapshot( + workflow.trace.tasks().size(), + path("$outputDir/transcript_identification/homer/cd4.bed"), + 
path("$outputDir/transcript_identification/homer/jurkat.bed"), + // FIXME Not determinstic because of the order of files + // Add to the other tests when fixed + // UTILS.getAllFilesFromDir("$outputDir/transcript_identification/pints/", ".bed"), + path("$outputDir/transcript_identification/intersect/").list(), + path("$outputDir/transcript_identification/filtered/").list(), + path("$outputDir/transcript_identification/intersect/").list(), + path("$outputDir/transcript_identification/filtered/").list(), + path("$outputDir/transcript_identification/grohmm/cd4.eval.txt"), + path("$outputDir/transcript_identification/grohmm/cd4.final.transcripts.bed"), + path("$outputDir/transcript_identification/grohmm/cd4.tdFinal.txt"), + path("$outputDir/transcript_identification/grohmm/cd4.tdplot_mqc.jpg").exists(), + path("$outputDir/transcript_identification/grohmm/cd4.transcripts.txt"), + path("$outputDir/transcript_identification/grohmm/jurkat.eval.txt"), + path("$outputDir/transcript_identification/grohmm/jurkat.final.transcripts.bed"), + path("$outputDir/transcript_identification/grohmm/jurkat.tdFinal.txt"), + path("$outputDir/transcript_identification/grohmm/jurkat.tdplot_mqc.jpg").exists(), + path("$outputDir/transcript_identification/grohmm/jurkat.transcripts.txt"), + // FIXME Not determinstic because of the order of files + // Add to the other tests when fixed + // path("$outputDir/quantification/").list(), + path("$outputDir/multiqc/multiqc_report.html").exists(), + ).match("output_files") + } + ) + } + } +} From 21ba6f7f291cec0d88e6cd88fbd2e1f61b8e9f2b Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Wed, 11 Sep 2024 17:50:02 -0500 Subject: [PATCH 03/54] fix(grohmm): bam => bams --- modules/local/grohmm/parametertuning/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/grohmm/parametertuning/main.nf b/modules/local/grohmm/parametertuning/main.nf index b6785e4d..14bceae3 100644 --- a/modules/local/grohmm/parametertuning/main.nf +++ 
b/modules/local/grohmm/parametertuning/main.nf @@ -25,7 +25,7 @@ process GROHMM_PARAMETERTUNING { def prefix = task.ext.prefix ?: "${meta.id}" """ parameter_tuning.R \\ - --bam_file ${bam} \\ + --bam_file ${bams} \\ --tuning_file ${tune_parameter_file} \\ --outprefix ${prefix} \\ --gtf $gtf \\ From 39265aa1790292acc4bbd4c0255b8c1b910e73a7 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Thu, 12 Sep 2024 12:36:40 -0500 Subject: [PATCH 04/54] fix(grohmm): Link up tuning files and samples --- modules/local/grohmm/parametertuning/main.nf | 2 +- modules/local/grohmm/transcriptcalling/main.nf | 3 +-- subworkflows/local/grohmm/main.nf | 8 +++++--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/modules/local/grohmm/parametertuning/main.nf b/modules/local/grohmm/parametertuning/main.nf index 14bceae3..3f2f0f50 100644 --- a/modules/local/grohmm/parametertuning/main.nf +++ b/modules/local/grohmm/parametertuning/main.nf @@ -14,7 +14,7 @@ process GROHMM_PARAMETERTUNING { path tune_parameter_file output: - path "*.tuning.csv" , emit: tuning + tuple val(meta), path("*.tuning.csv"), emit: tuning path "versions.yml", emit: versions when: diff --git a/modules/local/grohmm/transcriptcalling/main.nf b/modules/local/grohmm/transcriptcalling/main.nf index e93c84bb..a21b9022 100644 --- a/modules/local/grohmm/transcriptcalling/main.nf +++ b/modules/local/grohmm/transcriptcalling/main.nf @@ -9,9 +9,8 @@ process GROHMM_TRANSCRIPTCALLING { 'quay.io/biocontainers/mulled-v2-e9a6cb7894dd2753aff7d9446ea95c962cce8c46:0a46dae3241b1c4f02e46468f5d54eadcf64beca-0' }" input: - tuple val(meta), path(bams), path(bais) + tuple val(meta), path(bams), path(bais), path(tuning_file) path gtf - path tuning_file output: tuple val(meta), path("*.transcripts.txt"), emit: transcripts diff --git a/subworkflows/local/grohmm/main.nf b/subworkflows/local/grohmm/main.nf index 0c6d04ba..7a488a81 100644 --- a/subworkflows/local/grohmm/main.nf +++ b/subworkflows/local/grohmm/main.nf @@ -18,7 +18,7 @@ 
workflow GROHMM { ch_versions = Channel.empty() - ch_tuning = [] + ch_tuning = Channel.empty() if(!params.skip_tuning) { GROHMM_PARAMETERTUNING ( @@ -27,13 +27,15 @@ workflow GROHMM { tuning_file ) ch_tuning = GROHMM_PARAMETERTUNING.out.tuning + ch_bams_bais_tuning = bams_bais.join(ch_tuning) ch_versions = ch_versions.mix(GROHMM_PARAMETERTUNING.out.versions.first()) + } else { + ch_bams_bais_tuning = bams_bais.join(ch_tuning) } GROHMM_TRANSCRIPTCALLING ( - bams_bais, + [ch_bams_bais_tuning, []], gtf, - ch_tuning ) ch_versions = ch_versions.mix(GROHMM_TRANSCRIPTCALLING.out.versions.first()) From f431c7389ca297555e31ffef7998221e7cf1611e Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Fri, 13 Sep 2024 13:42:36 -0500 Subject: [PATCH 05/54] refactor(grohmm): Add notes of brainstorming --- .../local/grohmm/transcriptcalling/main.nf | 4 ++- subworkflows/local/grohmm/main.nf | 32 ++++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/modules/local/grohmm/transcriptcalling/main.nf b/modules/local/grohmm/transcriptcalling/main.nf index a21b9022..62947b17 100644 --- a/modules/local/grohmm/transcriptcalling/main.nf +++ b/modules/local/grohmm/transcriptcalling/main.nf @@ -9,8 +9,10 @@ process GROHMM_TRANSCRIPTCALLING { 'quay.io/biocontainers/mulled-v2-e9a6cb7894dd2753aff7d9446ea95c962cce8c46:0a46dae3241b1c4f02e46468f5d54eadcf64beca-0' }" input: - tuple val(meta), path(bams), path(bais), path(tuning_file) + tuple val(meta), path(bams), path(bais) path gtf + val UTS + val LtProbB output: tuple val(meta), path("*.transcripts.txt"), emit: transcripts diff --git a/subworkflows/local/grohmm/main.nf b/subworkflows/local/grohmm/main.nf index 7a488a81..ca7f4b13 100644 --- a/subworkflows/local/grohmm/main.nf +++ b/subworkflows/local/grohmm/main.nf @@ -3,7 +3,6 @@ */ include { GROHMM_TRANSCRIPTCALLING } from '../../../modules/local/grohmm/transcriptcalling/main.nf' -include { GROHMM_PARAMETERTUNING } from 
'../../../modules/local/grohmm/parametertuning/main.nf' /* * Note meta refers to all merged files @@ -20,6 +19,37 @@ workflow GROHMM { ch_tuning = Channel.empty() + // If a tuning file is provided, run transcriptcalling once + if(tuning_file) { + // TODO Find minimum + // uts <- tune[which.min(tune$errorRate), "UTS"] + // lt_probb <- tune[which.min(tune$errorRate), "LtProbB"] + GROHMM_TRANSCRIPTCALLING( + bam_bais, + gtf, + minimum_uts, + minimum_ltprobb, + ) + } else { + // Run transcriptcalling eval for each tuning param + // Should avoid a tuning file with a row for everything + // 5..45 by 5 for UTS is what we had currently + // -100..-400 by 50 for LtProbB + GROHMM_PARAMETERTUNING ( + bam_bais, + gtf, + ch_uts, + ch_ltprobb, + ) + // TODO CollectFile the tuning + // TODO Find the minimum values + // TODO Need to decide if windowAnalysis is important + // https://github.com/Functional-Genomics-Lab/groseq-analysis/blob/9b69519c41232fd653a2b2726e32d91b49abeb7e/research/groHMM2.R#L62C7-L62C21 + // If it is need to rerun transcriptcalling without it + } + + + if(!params.skip_tuning) { GROHMM_PARAMETERTUNING ( bams_bais, From 727fdd10cfa6f1401993d1f531a659f8c2130f1a Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Sat, 14 Sep 2024 15:58:33 -0500 Subject: [PATCH 06/54] feat: Add grohmm min and max tuning params --- conf/test.config | 7 ++- .../local/grohmm/transcriptcalling/main.nf | 4 +- nextflow.config | 6 ++- nextflow_schema.json | 29 ++++++++++- subworkflows/local/grohmm/main.nf | 49 ++++++------------- 5 files changed, 57 insertions(+), 38 deletions(-) diff --git a/conf/test.config b/conf/test.config index dd68aa6a..7f46f87a 100644 --- a/conf/test.config +++ b/conf/test.config @@ -33,7 +33,12 @@ params { assay_type = "GROseq" skip_grohmm = true // FIXME Fails due to higher memory requirements - tuning_file = "${projectDir}/tests/config/tuningparams_small.csv" + // tuning_file = "${projectDir}/tests/config/tuningparams_small.csv" + tuning_file = null + 
grohmm_min_uts = 5 + grohmm_max_uts = 10 + grohmm_min_ltprobb = -100 + grohmm_max_ltprobb = -150 filter_bed = "${projectDir}/tests/config/unwanted_region.bed" intersect_bed = "${projectDir}/tests/config/wanted_region.bed" } diff --git a/modules/local/grohmm/transcriptcalling/main.nf b/modules/local/grohmm/transcriptcalling/main.nf index 62947b17..f65d1c2f 100644 --- a/modules/local/grohmm/transcriptcalling/main.nf +++ b/modules/local/grohmm/transcriptcalling/main.nf @@ -28,13 +28,13 @@ process GROHMM_TRANSCRIPTCALLING { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def tuning = tuning_file ? "--tuning_file ${tuning_file}" : "" """ transcriptcalling_grohmm.R \\ --bam_file ${bams} \\ - $tuning \\ --outprefix ${prefix} \\ --gtf $gtf \\ + --uts $UTS \\ + --ltprobb $LtProbB \\ --outdir ./ \\ --cores $task.cpus \\ $args diff --git a/nextflow.config b/nextflow.config index 76290558..e1cfc965 100644 --- a/nextflow.config +++ b/nextflow.config @@ -25,8 +25,12 @@ params { // Transcript identification method assay_type = null skip_grohmm = false - skip_tuning = false tuning_file = null + grohmm_min_uts = 5 + grohmm_max_uts = 45 + // Depends on how you look at this one... 
But I figured most will ignore the negative + grohmm_min_ltprobb = -100 + grohmm_max_ltprobb = -400 filter_bed = null intersect_bed = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 735fa18e..02574536 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -126,7 +126,34 @@ "skip_grohmm": { "type": "boolean", "description": "Skip groHMM all together", - "fa_icon": "fas fa-eye-slash" + "fa_icon": "fas fa-eye-slash", + "default": false + }, + "grohmm_min_uts": { + "type": "integer", + "description": "Minimum number of UTs to use for groHMM.", + "fa_icon": "fas fa-sort-amount-desc", + "default": 5 + }, + "grohmm_max_uts": { + "type": "integer", + "description": "Maximum number of UTs to use for groHMM.", + "fa_icon": "fas fa-sort-amount-desc", + "default": 45 + }, + "grohmm_min_ltprobb": { + "type": "integer", + "description": "Minimum LTProbB value to use for groHMM.", + "fa_icon": "fas fa-sort-amount-desc", + "help_text": "Depends on how you look at this one, which is the minimum and maximum... But I figured most will ignore the negative, so we went with absolute values.", + "default": -100 + }, + "grohmm_max_ltprobb": { + "type": "integer", + "description": "Maximum LTProbB value to use for groHMM.", + "fa_icon": "fas fa-sort-amount-desc", + "help_text": "Depends on how you look at this one, which is the minimum and maximum... 
But I figured most will ignore the negative, so we went with absolute values.", + "default": -400 }, "tuning_file": { "type": "string", diff --git a/subworkflows/local/grohmm/main.nf b/subworkflows/local/grohmm/main.nf index ca7f4b13..88d04a1e 100644 --- a/subworkflows/local/grohmm/main.nf +++ b/subworkflows/local/grohmm/main.nf @@ -20,23 +20,25 @@ workflow GROHMM { ch_tuning = Channel.empty() // If a tuning file is provided, run transcriptcalling once - if(tuning_file) { - // TODO Find minimum - // uts <- tune[which.min(tune$errorRate), "UTS"] - // lt_probb <- tune[which.min(tune$errorRate), "LtProbB"] - GROHMM_TRANSCRIPTCALLING( - bam_bais, - gtf, - minimum_uts, - minimum_ltprobb, - ) - } else { + // if(tuning_file) { + // TODO Find minimum + // uts <- tune[which.min(tune$errorRate), "UTS"] + // lt_probb <- tune[which.min(tune$errorRate), "LtProbB"] + // GROHMM_TRANSCRIPTCALLING ( + // bams_bais, + // gtf, + // minimum_uts, + // minimum_ltprobb, + // ) + // } else { // Run transcriptcalling eval for each tuning param // Should avoid a tuning file with a row for everything // 5..45 by 5 for UTS is what we had currently + ch_uts = channel.of((params.grohmm_min_uts..params.grohmm_max_uts).step(5)) // -100..-400 by 50 for LtProbB - GROHMM_PARAMETERTUNING ( - bam_bais, + ch_ltprobb = channel.of((params.grohmm_min_ltprobb..params.grohmm_max_ltprobb).step(50)).view() + GROHMM_TRANSCRIPTCALLING ( + bams_bais, gtf, ch_uts, ch_ltprobb, @@ -46,27 +48,8 @@ workflow GROHMM { // TODO Need to decide if windowAnalysis is important // https://github.com/Functional-Genomics-Lab/groseq-analysis/blob/9b69519c41232fd653a2b2726e32d91b49abeb7e/research/groHMM2.R#L62C7-L62C21 // If it is need to rerun transcriptcalling without it - } - - - - if(!params.skip_tuning) { - GROHMM_PARAMETERTUNING ( - bams_bais, - gtf, - tuning_file - ) - ch_tuning = GROHMM_PARAMETERTUNING.out.tuning - ch_bams_bais_tuning = bams_bais.join(ch_tuning) - ch_versions = 
ch_versions.mix(GROHMM_PARAMETERTUNING.out.versions.first()) - } else { - ch_bams_bais_tuning = bams_bais.join(ch_tuning) - } + // } - GROHMM_TRANSCRIPTCALLING ( - [ch_bams_bais_tuning, []], - gtf, - ) ch_versions = ch_versions.mix(GROHMM_TRANSCRIPTCALLING.out.versions.first()) emit: From bb5948856eba51c21eb74cbccab54a46d61d21b6 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Sun, 15 Sep 2024 14:10:16 -0500 Subject: [PATCH 07/54] fix: of => fromList --- subworkflows/local/grohmm/main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/grohmm/main.nf b/subworkflows/local/grohmm/main.nf index 88d04a1e..f8ae0c16 100644 --- a/subworkflows/local/grohmm/main.nf +++ b/subworkflows/local/grohmm/main.nf @@ -34,9 +34,9 @@ workflow GROHMM { // Run transcriptcalling eval for each tuning param // Should avoid a tuning file with a row for everything // 5..45 by 5 for UTS is what we had currently - ch_uts = channel.of((params.grohmm_min_uts..params.grohmm_max_uts).step(5)) + ch_uts = channel.fromList((params.grohmm_min_uts..params.grohmm_max_uts).step(5)) // -100..-400 by 50 for LtProbB - ch_ltprobb = channel.of((params.grohmm_min_ltprobb..params.grohmm_max_ltprobb).step(50)).view() + ch_ltprobb = channel.fromList((params.grohmm_min_ltprobb..params.grohmm_max_ltprobb).step(50)).view() GROHMM_TRANSCRIPTCALLING ( bams_bais, gtf, From ab20315d3b8d579eff8315641d00dbc5235d2443 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Sun, 15 Sep 2024 16:30:31 -0500 Subject: [PATCH 08/54] refactor(grohmm): Use parameter tuning split --- bin/parameter_tuning.R | 55 ++++++++------------ modules/local/grohmm/parametertuning/main.nf | 10 ++-- subworkflows/local/grohmm/main.nf | 9 +++- 3 files changed, 35 insertions(+), 39 deletions(-) diff --git a/bin/parameter_tuning.R b/bin/parameter_tuning.R index 251cab70..d2ecf910 100755 --- a/bin/parameter_tuning.R +++ b/bin/parameter_tuning.R @@ -88,19 +88,9 @@ if (is.null(args$bam_files)) { print_help(args) 
stop("Please provide a bam file", call. = FALSE) } -if (is.null(args$tuning_file)) { - print_help(args) - stop("Please provide a tuning file", call. = FALSE) -} - - -# Read in bam file. -if (file.exists(args$outdir) == FALSE) { - dir.create(args$outdir, recursive = TRUE) -} -setwd(args$outdir) -# CHANGE BASED ON PAIRED OR SINGLE END +# Load alignment files +# TODO? CHANGE BASED ON PAIRED OR SINGLE END alignments <- c() for (bam in args$bam_files) { alignments <- append( @@ -121,32 +111,33 @@ kg_consensus <- makeConsensusAnnotations( ) print("Finished consensus annotations") -# TUNING -tune <- read.csv(args$tuning_file) - -evals <- mclapply(seq_len(nrow(tune)), function(x) { - hmm <- detectTranscripts( - reads = alignments, - LtProbB = tune$LtProbB[x], UTS = tune$UTS[x] - ) - e <- evaluateHMMInAnnotations(hmm$transcripts, kg_consensus) - e$eval -}, mc.cores = args$cores, mc.silent = TRUE) - -tune <- cbind(tune, do.call(rbind, evals)) -write.csv(tune, file = paste0(args$outprefix, ".tuning.csv")) +############ +## TUNING ## +############ +print("Starting tuning run") +hmm <- detectTranscripts( + reads = alignments, + LtProbB = args$ltprobb, + UTS = args$uts +) +print("Evaluating") +e <- evaluateHMMInAnnotations(hmm$transcripts, kg_consensus) +print(e) +print(e$eval) +write.csv(e$eval, file = paste0(args$outprefix, ".tuning.csv")) -# CITE PACKAGES USED +######################## +## CITE PACKAGES USED ## +######################## citation("groHMM") citation("GenomicFeatures") citation("GenomicAlignments") citation("AnnotationDbi") -## R SESSION INFO ## -################################################ -################################################ - +#################### +## R SESSION INFO ## +#################### r_log_file <- "R_sessionInfo.log" if (file.exists(r_log_file) == FALSE) { sink(r_log_file) @@ -154,5 +145,3 @@ if (file.exists(r_log_file) == FALSE) { print(a) sink() } - -################################################################################ 
diff --git a/modules/local/grohmm/parametertuning/main.nf b/modules/local/grohmm/parametertuning/main.nf index 3f2f0f50..c882ecd3 100644 --- a/modules/local/grohmm/parametertuning/main.nf +++ b/modules/local/grohmm/parametertuning/main.nf @@ -1,5 +1,5 @@ process GROHMM_PARAMETERTUNING { - tag "$meta.id" + tag "$meta.id|$UTS|$LtProbB" label 'process_high' label 'process_long' @@ -11,7 +11,8 @@ process GROHMM_PARAMETERTUNING { input: tuple val(meta), path(bams), path(bais) path gtf - path tune_parameter_file + val UTS + val LtProbB output: tuple val(meta), path("*.tuning.csv"), emit: tuning @@ -22,13 +23,14 @@ process GROHMM_PARAMETERTUNING { script: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" + def prefix = task.ext.prefix ?: "${meta.id}_${UTS}_${LtProbB}" """ parameter_tuning.R \\ --bam_file ${bams} \\ - --tuning_file ${tune_parameter_file} \\ --outprefix ${prefix} \\ --gtf $gtf \\ + --uts $UTS \\ + --ltprobb $LtProbB \\ --outdir ./ \\ --cores $task.cpus \\ $args diff --git a/subworkflows/local/grohmm/main.nf b/subworkflows/local/grohmm/main.nf index f8ae0c16..2b417ef9 100644 --- a/subworkflows/local/grohmm/main.nf +++ b/subworkflows/local/grohmm/main.nf @@ -37,13 +37,18 @@ workflow GROHMM { ch_uts = channel.fromList((params.grohmm_min_uts..params.grohmm_max_uts).step(5)) // -100..-400 by 50 for LtProbB ch_ltprobb = channel.fromList((params.grohmm_min_ltprobb..params.grohmm_max_ltprobb).step(50)).view() - GROHMM_TRANSCRIPTCALLING ( + GROHMM_PARAMETERTUNING ( bams_bais, gtf, ch_uts, ch_ltprobb, ) - // TODO CollectFile the tuning + .tuning + .collectFile( + name: "${params.outdir}/transcript_identification/grohmm/${item[0].id}_tuning.csv", + keepHeader: true, + skip: 1, + ) // TODO Find the minimum values // TODO Need to decide if windowAnalysis is important // https://github.com/Functional-Genomics-Lab/groseq-analysis/blob/9b69519c41232fd653a2b2726e32d91b49abeb7e/research/groHMM2.R#L62C7-L62C21 From 
6e0b9a8b81f9fcd26e0fd7c2a1a1b8276389b0c2 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Sun, 15 Sep 2024 16:34:00 -0500 Subject: [PATCH 09/54] fix(grohmm): Use windowAnalysis Apparently we weren't doing this. It's in the guidelines and in Shayne's scripts --- bin/parameter_tuning.R | 6 +++++- subworkflows/local/grohmm/main.nf | 3 --- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/bin/parameter_tuning.R b/bin/parameter_tuning.R index d2ecf910..8c513a6a 100755 --- a/bin/parameter_tuning.R +++ b/bin/parameter_tuning.R @@ -115,16 +115,20 @@ print("Finished consensus annotations") ## TUNING ## ############ print("Starting tuning run") +Fp <- windowAnalysis(alignments, strand = "+", windowSize = 50) +Fm <- windowAnalysis(alignments, strand = "-", windowSize = 50) hmm <- detectTranscripts( + Fp = Fp, + Fm = Fm, reads = alignments, LtProbB = args$ltprobb, UTS = args$uts ) print("Evaluating") e <- evaluateHMMInAnnotations(hmm$transcripts, kg_consensus) +print(e$eval) print(e) -print(e$eval) write.csv(e$eval, file = paste0(args$outprefix, ".tuning.csv")) ######################## diff --git a/subworkflows/local/grohmm/main.nf b/subworkflows/local/grohmm/main.nf index 2b417ef9..8466c885 100644 --- a/subworkflows/local/grohmm/main.nf +++ b/subworkflows/local/grohmm/main.nf @@ -50,9 +50,6 @@ workflow GROHMM { skip: 1, ) // TODO Find the minimum values - // TODO Need to decide if windowAnalysis is important - // https://github.com/Functional-Genomics-Lab/groseq-analysis/blob/9b69519c41232fd653a2b2726e32d91b49abeb7e/research/groHMM2.R#L62C7-L62C21 - // If it is need to rerun transcriptcalling without it // } ch_versions = ch_versions.mix(GROHMM_TRANSCRIPTCALLING.out.versions.first()) From 114136cc558255b9a4e46a9e516b6ee748f013a0 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Sun, 15 Sep 2024 17:02:27 -0500 Subject: [PATCH 10/54] refactor: Setup transcript calling --- bin/transcriptcalling_grohmm.R | 25 +++++------ .../local/grohmm/transcriptcalling/main.nf | 
7 +--- subworkflows/local/grohmm/main.nf | 42 ++++++++++--------- 3 files changed, 37 insertions(+), 37 deletions(-) diff --git a/bin/transcriptcalling_grohmm.R b/bin/transcriptcalling_grohmm.R index 3c99c8bd..d6995b52 100755 --- a/bin/transcriptcalling_grohmm.R +++ b/bin/transcriptcalling_grohmm.R @@ -96,13 +96,15 @@ for (bam in args$bam_files) { # Call annotations > DEFAULT VALUES ASSIGNED if (is.null(args$tuning_file)) { + # Use user supplied values or defaults hmm_result <- detectTranscripts( alignments, LtProbB = args$ltprobb, UTS = args$uts, threshold = 1 - ) # Uses either inputted or default values + ) } else { + # Use tuning file and take the minimum values tune <- read.csv(args$tuning_file) # Minimum error uts <- tune[which.min(tune$errorRate), "UTS"] @@ -115,6 +117,8 @@ if (is.null(args$tuning_file)) { ) } +# NOTE TUNING IN A DIFFERENT SCRIPT + tx_hmm <- hmm_result$transcripts write.table( tx_hmm, @@ -140,9 +144,8 @@ e <- evaluateHMMInAnnotations(tx_hmm, kg_consensus) # Save as txt file capture.output(e$eval, file = paste0(args$outprefix, ".eval.txt")) -# TUNING IN A DIFFERENT SCRIPT -# repairing with annotations +print("repairing with annotations") get_expressed_annotations <- function(features, reads) { f_limit <- limitToXkb(features) count <- countOverlaps(f_limit, reads) @@ -164,25 +167,25 @@ export( con = paste(args$outprefix, ".final.transcripts.bed", sep = "") ) capture.output(td_final, file = paste0(args$outprefix, ".tdFinal.txt")) -# Output plot +# 1. Output plot jpeg(file = paste0(args$outprefix, ".tdplot_mqc.jpg")) # 2. Create the plot td_final <- getTxDensity(tx_final, con_expressed, mc.cores = args$cores) - # 3. 
Close the file dev.off() -# CITE PACKAGES USED +######################## +## CITE PACKAGES USED ## +######################## citation("groHMM") citation("GenomicFeatures") citation("GenomicAlignments") citation("AnnotationDbi") -## R SESSION INFO ## -################################################ -################################################ - +#################### +## R SESSION INFO ## +#################### r_log_file <- "R_sessionInfo.log" if (file.exists(r_log_file) == FALSE) { sink(r_log_file) @@ -190,5 +193,3 @@ if (file.exists(r_log_file) == FALSE) { print(a) sink() } - -################################################################################ diff --git a/modules/local/grohmm/transcriptcalling/main.nf b/modules/local/grohmm/transcriptcalling/main.nf index f65d1c2f..d961f860 100644 --- a/modules/local/grohmm/transcriptcalling/main.nf +++ b/modules/local/grohmm/transcriptcalling/main.nf @@ -9,10 +9,8 @@ process GROHMM_TRANSCRIPTCALLING { 'quay.io/biocontainers/mulled-v2-e9a6cb7894dd2753aff7d9446ea95c962cce8c46:0a46dae3241b1c4f02e46468f5d54eadcf64beca-0' }" input: - tuple val(meta), path(bams), path(bais) + tuple val(meta), path(bams), path(bais), path(tuning) path gtf - val UTS - val LtProbB output: tuple val(meta), path("*.transcripts.txt"), emit: transcripts @@ -31,10 +29,9 @@ process GROHMM_TRANSCRIPTCALLING { """ transcriptcalling_grohmm.R \\ --bam_file ${bams} \\ + --tuning_file ${tuning_file} \\ --outprefix ${prefix} \\ --gtf $gtf \\ - --uts $UTS \\ - --ltprobb $LtProbB \\ --outdir ./ \\ --cores $task.cpus \\ $args diff --git a/subworkflows/local/grohmm/main.nf b/subworkflows/local/grohmm/main.nf index 8466c885..337a5ef3 100644 --- a/subworkflows/local/grohmm/main.nf +++ b/subworkflows/local/grohmm/main.nf @@ -19,18 +19,7 @@ workflow GROHMM { ch_tuning = Channel.empty() - // If a tuning file is provided, run transcriptcalling once - // if(tuning_file) { - // TODO Find minimum - // uts <- tune[which.min(tune$errorRate), "UTS"] - // lt_probb 
<- tune[which.min(tune$errorRate), "LtProbB"] - // GROHMM_TRANSCRIPTCALLING ( - // bams_bais, - // gtf, - // minimum_uts, - // minimum_ltprobb, - // ) - // } else { + if(!tuning_file) { // Run transcriptcalling eval for each tuning param // Should avoid a tuning file with a row for everything // 5..45 by 5 for UTS is what we had currently @@ -43,15 +32,28 @@ workflow GROHMM { ch_uts, ch_ltprobb, ) - .tuning - .collectFile( - name: "${params.outdir}/transcript_identification/grohmm/${item[0].id}_tuning.csv", - keepHeader: true, - skip: 1, - ) - // TODO Find the minimum values - // } + .tuning + .collectFile( + name: "${params.outdir}/transcript_identification/grohmm/${item[0].id}_tuning.csv", + keepHeader: true, + skip: 1, + newLine: true, + ) + .set { ch_tuning } + + ch_bams_bais_tuning = bams_bais.join(ch_tuning, by: [0]) + + ch_versions = ch_versions.mix(GROHMM_PARAMETERTUNING.out.versions.first()) + } else { + // If a tuning file is provided, run transcriptcalling once + // NOTE This doesn't really handle multiple "groups well" + ch_bams_bais_tuning = bams_bais.join(tuning_file) + } + GROHMM_TRANSCRIPTCALLING ( + ch_bams_bais_tuning, + gtf, + ) ch_versions = ch_versions.mix(GROHMM_TRANSCRIPTCALLING.out.versions.first()) emit: From d72402b5522cf16e91321456aba5680f12ab5ce0 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Wed, 18 Sep 2024 18:18:01 -0500 Subject: [PATCH 11/54] build: Build a seperate grohmm conda package --- .../local/grohmm/parametertuning/environment.yml | 14 ++++++++++++++ modules/local/grohmm/parametertuning/main.nf | 8 ++++---- .../local/grohmm/transcriptcalling/environment.yml | 14 ++++++++++++++ modules/local/grohmm/transcriptcalling/main.nf | 8 ++++---- 4 files changed, 36 insertions(+), 8 deletions(-) create mode 100644 modules/local/grohmm/parametertuning/environment.yml create mode 100644 modules/local/grohmm/transcriptcalling/environment.yml diff --git a/modules/local/grohmm/parametertuning/environment.yml 
b/modules/local/grohmm/parametertuning/environment.yml new file mode 100644 index 00000000..e9f7efa5 --- /dev/null +++ b/modules/local/grohmm/parametertuning/environment.yml @@ -0,0 +1,14 @@ +name: grohmm +channels: + - https://repo.prefix.dev/bioinformaticsorphanage + - conda-forge + - bioconda +dependencies: + - conda-forge::r-base + - conda-forge::r-optparse + - conda-forge::r-argparse + - bioconda::bioconductor-genomicfeatures + # NOTE Can't use the official package because of + # https://github.com/dankoc/groHMM/issues/2 + # - bioconda::bioconductor-grohmm=1.28.0 + - bioinfoorphanage-grohmm diff --git a/modules/local/grohmm/parametertuning/main.nf b/modules/local/grohmm/parametertuning/main.nf index c882ecd3..8579c11e 100644 --- a/modules/local/grohmm/parametertuning/main.nf +++ b/modules/local/grohmm/parametertuning/main.nf @@ -3,10 +3,10 @@ process GROHMM_PARAMETERTUNING { label 'process_high' label 'process_long' - conda "conda-forge::r-base=4.1.1 conda-forge::r-optparse=1.7.1 conda-forge::r-argparse=2.1.3 bioconda::bioconductor-genomicfeatures=1.46.1 bioconda::bioconductor-grohmm=1.28.0" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-e9a6cb7894dd2753aff7d9446ea95c962cce8c46:0a46dae3241b1c4f02e46468f5d54eadcf64beca-0' : - 'quay.io/biocontainers/mulled-v2-e9a6cb7894dd2753aff7d9446ea95c962cce8c46:0a46dae3241b1c4f02e46468f5d54eadcf64beca-0' }" + conda "${moduleDir}/environment.yml" + // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ // 'https://depot.galaxyproject.org/singularity/mulled-v2-e9a6cb7894dd2753aff7d9446ea95c962cce8c46:0a46dae3241b1c4f02e46468f5d54eadcf64beca-0' : + // 'quay.io/biocontainers/mulled-v2-e9a6cb7894dd2753aff7d9446ea95c962cce8c46:0a46dae3241b1c4f02e46468f5d54eadcf64beca-0' }" input: tuple val(meta), path(bams), path(bais) diff --git a/modules/local/grohmm/transcriptcalling/environment.yml b/modules/local/grohmm/transcriptcalling/environment.yml new file mode 100644 index 00000000..e9f7efa5 --- /dev/null +++ b/modules/local/grohmm/transcriptcalling/environment.yml @@ -0,0 +1,14 @@ +name: grohmm +channels: + - https://repo.prefix.dev/bioinformaticsorphanage + - conda-forge + - bioconda +dependencies: + - conda-forge::r-base + - conda-forge::r-optparse + - conda-forge::r-argparse + - bioconda::bioconductor-genomicfeatures + # NOTE Can't use the official package because of + # https://github.com/dankoc/groHMM/issues/2 + # - bioconda::bioconductor-grohmm=1.28.0 + - bioinfoorphanage-grohmm diff --git a/modules/local/grohmm/transcriptcalling/main.nf b/modules/local/grohmm/transcriptcalling/main.nf index d961f860..527e8667 100644 --- a/modules/local/grohmm/transcriptcalling/main.nf +++ b/modules/local/grohmm/transcriptcalling/main.nf @@ -3,10 +3,10 @@ process GROHMM_TRANSCRIPTCALLING { label 'process_high' label 'process_long' - conda "conda-forge::r-base=4.1.1 conda-forge::r-optparse=1.7.1 conda-forge::r-argparse=2.1.3 bioconda::bioconductor-genomicfeatures=1.46.1 bioconda::bioconductor-grohmm=1.28.0" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mulled-v2-e9a6cb7894dd2753aff7d9446ea95c962cce8c46:0a46dae3241b1c4f02e46468f5d54eadcf64beca-0' : - 'quay.io/biocontainers/mulled-v2-e9a6cb7894dd2753aff7d9446ea95c962cce8c46:0a46dae3241b1c4f02e46468f5d54eadcf64beca-0' }" + conda "${moduleDir}/environment.yml" + // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + // 'https://depot.galaxyproject.org/singularity/mulled-v2-e9a6cb7894dd2753aff7d9446ea95c962cce8c46:0a46dae3241b1c4f02e46468f5d54eadcf64beca-0' : + // 'quay.io/biocontainers/mulled-v2-e9a6cb7894dd2753aff7d9446ea95c962cce8c46:0a46dae3241b1c4f02e46468f5d54eadcf64beca-0' }" input: tuple val(meta), path(bams), path(bais), path(tuning) From ce1c51255019362f1b1127c8709036ce13aa95ff Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Wed, 18 Sep 2024 18:33:32 -0500 Subject: [PATCH 12/54] chore: Add working --- subworkflows/local/grohmm/main.nf | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/subworkflows/local/grohmm/main.nf b/subworkflows/local/grohmm/main.nf index 337a5ef3..15b867b6 100644 --- a/subworkflows/local/grohmm/main.nf +++ b/subworkflows/local/grohmm/main.nf @@ -2,6 +2,7 @@ * Run parametertuning optionally, otherwise just run transcript calling */ +include { GROHMM_PARAMETERTUNING } from '../../../modules/local/grohmm/parametertuning/main.nf' include { GROHMM_TRANSCRIPTCALLING } from '../../../modules/local/grohmm/transcriptcalling/main.nf' /* @@ -19,7 +20,7 @@ workflow GROHMM { ch_tuning = Channel.empty() - if(!tuning_file) { + // if(!tuning_file) { // Run transcriptcalling eval for each tuning param // Should avoid a tuning file with a row for everything // 5..45 by 5 for UTS is what we had currently @@ -34,21 +35,21 @@ workflow GROHMM { ) .tuning .collectFile( - name: "${params.outdir}/transcript_identification/grohmm/${item[0].id}_tuning.csv", + name: "${params.outdir}/transcript_identification/grohmm/tuning.csv", 
keepHeader: true, skip: 1, newLine: true, ) - .set { ch_tuning } + .set { tuning } - ch_bams_bais_tuning = bams_bais.join(ch_tuning, by: [0]) + ch_bams_bais_tuning = bams_bais.join(tuning, by: [0]) ch_versions = ch_versions.mix(GROHMM_PARAMETERTUNING.out.versions.first()) - } else { + // } else { // If a tuning file is provided, run transcriptcalling once // NOTE This doesn't really handle multiple "groups well" - ch_bams_bais_tuning = bams_bais.join(tuning_file) - } + // ch_bams_bais_tuning = bams_bais.join(tuning_file) + // } GROHMM_TRANSCRIPTCALLING ( ch_bams_bais_tuning, From 6ba858f5ab8941806dd758c8a6ee4d40fa7a2292 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Thu, 19 Sep 2024 10:02:59 -0500 Subject: [PATCH 13/54] build(grohmm): Add Seqera containers --- modules/local/grohmm/parametertuning/environment.yml | 8 ++++---- modules/local/grohmm/parametertuning/main.nf | 6 +++--- modules/local/grohmm/transcriptcalling/main.nf | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/modules/local/grohmm/parametertuning/environment.yml b/modules/local/grohmm/parametertuning/environment.yml index e9f7efa5..bce9c898 100644 --- a/modules/local/grohmm/parametertuning/environment.yml +++ b/modules/local/grohmm/parametertuning/environment.yml @@ -4,10 +4,10 @@ channels: - conda-forge - bioconda dependencies: - - conda-forge::r-base - - conda-forge::r-optparse - - conda-forge::r-argparse - - bioconda::bioconductor-genomicfeatures + - conda-forge::r-base # =4.1.1 + - conda-forge::r-optparse # =1.7.1 + - conda-forge::r-argparse # =2.1.3 + - bioconda::bioconductor-genomicfeatures # =1.46.1 # NOTE Can't use the official package because of # https://github.com/dankoc/groHMM/issues/2 # - bioconda::bioconductor-grohmm=1.28.0 diff --git a/modules/local/grohmm/parametertuning/main.nf b/modules/local/grohmm/parametertuning/main.nf index 8579c11e..c9ac0239 100644 --- a/modules/local/grohmm/parametertuning/main.nf +++ b/modules/local/grohmm/parametertuning/main.nf 
@@ -4,9 +4,9 @@ process GROHMM_PARAMETERTUNING { label 'process_long' conda "${moduleDir}/environment.yml" - // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - // 'https://depot.galaxyproject.org/singularity/mulled-v2-e9a6cb7894dd2753aff7d9446ea95c962cce8c46:0a46dae3241b1c4f02e46468f5d54eadcf64beca-0' : - // 'quay.io/biocontainers/mulled-v2-e9a6cb7894dd2753aff7d9446ea95c962cce8c46:0a46dae3241b1c4f02e46468f5d54eadcf64beca-0' }" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'oras://community.wave.seqera.io/library/grohmm:a660d9c3942c9b85' : + 'community.wave.seqera.io/library/grohmm:780b8693bdaa87b9' }" input: tuple val(meta), path(bams), path(bais) diff --git a/modules/local/grohmm/transcriptcalling/main.nf b/modules/local/grohmm/transcriptcalling/main.nf index 527e8667..80edc07a 100644 --- a/modules/local/grohmm/transcriptcalling/main.nf +++ b/modules/local/grohmm/transcriptcalling/main.nf @@ -4,9 +4,9 @@ process GROHMM_TRANSCRIPTCALLING { label 'process_long' conda "${moduleDir}/environment.yml" - // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - // 'https://depot.galaxyproject.org/singularity/mulled-v2-e9a6cb7894dd2753aff7d9446ea95c962cce8c46:0a46dae3241b1c4f02e46468f5d54eadcf64beca-0' : - // 'quay.io/biocontainers/mulled-v2-e9a6cb7894dd2753aff7d9446ea95c962cce8c46:0a46dae3241b1c4f02e46468f5d54eadcf64beca-0' }" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'oras://community.wave.seqera.io/library/grohmm:a660d9c3942c9b85' : + 'community.wave.seqera.io/library/grohmm:780b8693bdaa87b9' }" input: tuple val(meta), path(bams), path(bais), path(tuning) From 46646f0e7f2a0abc7d7403507a1e652de5cf3d1a Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Thu, 19 Sep 2024 17:32:32 -0500 Subject: [PATCH 14/54] test(grohmm): Add chr7 gtf --- .../grohmm/parametertuning/tests/main.nf.test | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/modules/local/grohmm/parametertuning/tests/main.nf.test b/modules/local/grohmm/parametertuning/tests/main.nf.test index c80c61b4..da7c5338 100644 --- a/modules/local/grohmm/parametertuning/tests/main.nf.test +++ b/modules/local/grohmm/parametertuning/tests/main.nf.test @@ -4,7 +4,7 @@ nextflow_process { script "modules/local/grohmm/parametertuning/main.nf" process "GROHMM_PARAMETERTUNING" - test("Should run without a tuning file") { + test("Should run test data") { when { params { @@ -13,15 +13,19 @@ nextflow_process { process { """ input[0] = [ - [ id: 'mR1' ], - [ file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/illumina/bam/test.single_end.sorted.bam", checkIfExists: true), ] + [ id: 'Sall' ], + [ file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S0mR1.bam", checkIfExists: true), + file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S40mR1.bam", checkIfExists: true) ], + [] ] - // FIXME This needs to match up with s40 input[1] = file( - "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/genome/genome.gtf", + // "http://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/genes/hg19.refGene.gtf.gz", + // "${projectDir}/hg19.refGene.gtf", + "${projectDir}/modules/local/grohmm/parametertuning/tests/hg19.chr7.refGene.gtf", checkIfExists: true ) - input[2] = [] + input[2] = 30 + input[3] = -100 """ } } From 
2ad3b9cc8b2787d748a9f3de1074d6f0aa9808fe Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Sun, 22 Sep 2024 21:14:51 -0500 Subject: [PATCH 15/54] test: Setup tests for grohmm --- .../grohmm/parametertuning/environment.yml | 2 +- modules/local/grohmm/parametertuning/main.nf | 1 + .../grohmm/parametertuning/tests/main.nf.test | 40 ++++++++++++++----- .../local/grohmm/transcriptcalling/main.nf | 2 +- .../transcriptcalling/tests/main.nf.test | 38 ++++++++++++++---- 5 files changed, 64 insertions(+), 19 deletions(-) diff --git a/modules/local/grohmm/parametertuning/environment.yml b/modules/local/grohmm/parametertuning/environment.yml index bce9c898..c77ab0b6 100644 --- a/modules/local/grohmm/parametertuning/environment.yml +++ b/modules/local/grohmm/parametertuning/environment.yml @@ -7,7 +7,7 @@ dependencies: - conda-forge::r-base # =4.1.1 - conda-forge::r-optparse # =1.7.1 - conda-forge::r-argparse # =2.1.3 - - bioconda::bioconductor-genomicfeatures # =1.46.1 + - bioconda::bioconductor-genomicfeatures=1.46.1 # NOTE Can't use the official package because of # https://github.com/dankoc/groHMM/issues/2 # - bioconda::bioconductor-grohmm=1.28.0 diff --git a/modules/local/grohmm/parametertuning/main.nf b/modules/local/grohmm/parametertuning/main.nf index c9ac0239..ba0db7ba 100644 --- a/modules/local/grohmm/parametertuning/main.nf +++ b/modules/local/grohmm/parametertuning/main.nf @@ -1,5 +1,6 @@ process GROHMM_PARAMETERTUNING { tag "$meta.id|$UTS|$LtProbB" + debug true label 'process_high' label 'process_long' diff --git a/modules/local/grohmm/parametertuning/tests/main.nf.test b/modules/local/grohmm/parametertuning/tests/main.nf.test index da7c5338..039c164a 100644 --- a/modules/local/grohmm/parametertuning/tests/main.nf.test +++ b/modules/local/grohmm/parametertuning/tests/main.nf.test @@ -4,8 +4,7 @@ nextflow_process { script "modules/local/grohmm/parametertuning/main.nf" process "GROHMM_PARAMETERTUNING" - test("Should run test data") { - + test("Should fail with a 
tuning file") { when { params { outdir = "$outputDir" @@ -16,16 +15,13 @@ nextflow_process { [ id: 'Sall' ], [ file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S0mR1.bam", checkIfExists: true), file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S40mR1.bam", checkIfExists: true) ], - [] + [], + [], ] input[1] = file( - // "http://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/genes/hg19.refGene.gtf.gz", - // "${projectDir}/hg19.refGene.gtf", - "${projectDir}/modules/local/grohmm/parametertuning/tests/hg19.chr7.refGene.gtf", + "${projectDir}/modules/local/grohmm/parametertuning/tests/main.nf.test", checkIfExists: true ) - input[2] = 30 - input[3] = -100 """ } } @@ -37,9 +33,33 @@ nextflow_process { { assert snapshot(process.out).match() } ) } - } - // TODO Add a test for with a tuning file + test("Should run with a tuning file and without bai files") { + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id: 'Sall' ], + [ file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S0mR1.bam", checkIfExists: true), + file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S40mR1.bam", checkIfExists: true) ], + [], + file("${projectDir}/tests/config/tuningparams_small.csv", checkIfExists: true), + ] + input[1] = file( + "${projectDir}/modules/local/grohmm/parametertuning/tests/hg19.chr7.refGene.gtf", + checkIfExists: true + ) + """ + } + } + then { + // TODO + assert process.failed + } + } } diff --git a/modules/local/grohmm/transcriptcalling/main.nf b/modules/local/grohmm/transcriptcalling/main.nf index 80edc07a..f7df587f 100644 --- a/modules/local/grohmm/transcriptcalling/main.nf +++ b/modules/local/grohmm/transcriptcalling/main.nf @@ -9,7 +9,7 @@ process GROHMM_TRANSCRIPTCALLING { 'community.wave.seqera.io/library/grohmm:780b8693bdaa87b9' }" input: - tuple val(meta), path(bams), path(bais), path(tuning) + tuple val(meta), path(bams), 
path(bais), path(tuning_file) path gtf output: diff --git a/modules/local/grohmm/transcriptcalling/tests/main.nf.test b/modules/local/grohmm/transcriptcalling/tests/main.nf.test index 6b088542..fc7d6e50 100644 --- a/modules/local/grohmm/transcriptcalling/tests/main.nf.test +++ b/modules/local/grohmm/transcriptcalling/tests/main.nf.test @@ -4,8 +4,7 @@ nextflow_process { script "modules/local/grohmm/transcriptcalling/main.nf" process "GROHMM_TRANSCRIPTCALLING" - test("Should run without a tuning file") { - + test("Should fail without a tuning file") { when { params { outdir = "$outputDir" @@ -15,13 +14,14 @@ nextflow_process { input[0] = [ [ id: 'mR1' ], [ file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S0mR1.bam", checkIfExists: true), - file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S40mR1.bam", checkIfExists: true) ] + file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S40mR1.bam", checkIfExists: true) ], + [], + [], ] input[1] = file( - "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.gtf", + "${projectDir}/modules/local/grohmm/parametertuning/tests/hg19.chr7.refGene.gtf", checkIfExists: true ) - input[2] = [] """ } } @@ -32,9 +32,33 @@ nextflow_process { { assert snapshot(process.out).match() } ) } - } - // TODO Add test with a tuning file + test("Should run with a tuning file") { + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id: 'mR1' ], + [ file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S0mR1.bam", checkIfExists: true), + file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S40mR1.bam", checkIfExists: true) ], + [], + file("${projectDir}/tests/config/tuningparams_small.csv", checkIfExists: true), + ] + input[1] = file( + "${projectDir}/modules/local/grohmm/parametertuning/tests/hg19.chr7.refGene.gtf", + checkIfExists: true + ) + 
""" + } + } + + then { + assert process.success + } + } } From 65d16d03f22c0ce355c4b368265547f45b89d3a6 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Sun, 22 Sep 2024 21:26:28 -0500 Subject: [PATCH 16/54] chore(grohmm): Add example tuning evals from tutorial --- tests/config/tuningparams_small.csv | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/config/tuningparams_small.csv b/tests/config/tuningparams_small.csv index 047b4186..e28ef149 100644 --- a/tests/config/tuningparams_small.csv +++ b/tests/config/tuningparams_small.csv @@ -1,3 +1,10 @@ -"LtProbB","UTS" --100,5 --200,5 +LtProbB,UTS,merged,dissociated,total,errorRate,txSize +-100,5,50,135,185,0.07769845,1391 +-100,10,61,177,238,0.07775237,2071 +-100,15,68,201,269,0.07441217,2625 +-200,5,64,47,111,0.06098901,830 +-200,10,74,65,139,0.06547339,1133 +-200,15,80,76,156,0.06643952,1358 +-300,5,69,22,91,0.05501814,664 +-300,10,82,30,112,0.06005362,875 +-300,15,90,41,131,0.06498016,1026 From e221791516e037604a0c4294661452b1dbe8b6a7 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Mon, 23 Sep 2024 10:34:51 -0500 Subject: [PATCH 17/54] fix(grohmm): Try removing any genes from "random" Chromosome --- bin/transcriptcalling_grohmm.R | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bin/transcriptcalling_grohmm.R b/bin/transcriptcalling_grohmm.R index d6995b52..6bcd5cd7 100755 --- a/bin/transcriptcalling_grohmm.R +++ b/bin/transcriptcalling_grohmm.R @@ -82,6 +82,8 @@ parser$add_argument( args <- parser$parse_args() +options(mc.cores = getCores(args$cores)) + setwd(args$outdir) # Load alignment files @@ -129,8 +131,13 @@ write.table( ) print("Input transcript annotations") -kg_db <- makeTxDbFromGFF(args$gtf) +kg_db <- makeTxDbFromGFF(args$gtf, format = "gtf") kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name")) +# TODO I wonder if I could speed things up by filtering by chromosome at the Nextflow level +# filter=list(tx_chrom="chr7")) +# 
exclude any transcripts that are located on chromosomes labeled with "random". +kg_tx <- kg_tx[grep("random", as.character(seqnames(kg_tx)), invert = TRUE), ] +print(kg_tx) print("Collapse annotations in preparation for overlap") kg_consensus <- makeConsensusAnnotations( kg_tx, From 0691cba31e46097dd6c7dbdd54b11b9fc6b25a7e Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Mon, 23 Sep 2024 17:01:48 -0500 Subject: [PATCH 18/54] test(grohmm): Use kgChr7 gtf --- modules/local/grohmm/parametertuning/tests/main.nf.test | 4 ++-- modules/local/grohmm/transcriptcalling/tests/main.nf.test | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/local/grohmm/parametertuning/tests/main.nf.test b/modules/local/grohmm/parametertuning/tests/main.nf.test index 039c164a..c86d126f 100644 --- a/modules/local/grohmm/parametertuning/tests/main.nf.test +++ b/modules/local/grohmm/parametertuning/tests/main.nf.test @@ -19,7 +19,7 @@ nextflow_process { [], ] input[1] = file( - "${projectDir}/modules/local/grohmm/parametertuning/tests/main.nf.test", + "https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/nascent/reference/kgChr7.gtf", checkIfExists: true ) """ @@ -50,7 +50,7 @@ nextflow_process { file("${projectDir}/tests/config/tuningparams_small.csv", checkIfExists: true), ] input[1] = file( - "${projectDir}/modules/local/grohmm/parametertuning/tests/hg19.chr7.refGene.gtf", + "https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/nascent/reference/kgChr7.gtf", checkIfExists: true ) """ diff --git a/modules/local/grohmm/transcriptcalling/tests/main.nf.test b/modules/local/grohmm/transcriptcalling/tests/main.nf.test index fc7d6e50..06387b5d 100644 --- a/modules/local/grohmm/transcriptcalling/tests/main.nf.test +++ b/modules/local/grohmm/transcriptcalling/tests/main.nf.test @@ -19,7 +19,7 @@ nextflow_process { [], ] input[1] = file( - "${projectDir}/modules/local/grohmm/parametertuning/tests/hg19.chr7.refGene.gtf", + 
"https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/nascent/reference/kgChr7.gtf", checkIfExists: true ) """ @@ -49,7 +49,7 @@ nextflow_process { file("${projectDir}/tests/config/tuningparams_small.csv", checkIfExists: true), ] input[1] = file( - "${projectDir}/modules/local/grohmm/parametertuning/tests/hg19.chr7.refGene.gtf", + "https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/nascent/reference/kgChr7.gtf", checkIfExists: true ) """ From 692e688e0dc8ce56febede7ccef70e421ee505f9 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Tue, 24 Sep 2024 10:50:35 -0500 Subject: [PATCH 19/54] test: Try it with refGene again --- modules/local/grohmm/parametertuning/tests/main.nf.test | 4 ++-- modules/local/grohmm/transcriptcalling/tests/main.nf.test | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/local/grohmm/parametertuning/tests/main.nf.test b/modules/local/grohmm/parametertuning/tests/main.nf.test index c86d126f..4c7fa796 100644 --- a/modules/local/grohmm/parametertuning/tests/main.nf.test +++ b/modules/local/grohmm/parametertuning/tests/main.nf.test @@ -19,7 +19,7 @@ nextflow_process { [], ] input[1] = file( - "https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/nascent/reference/kgChr7.gtf", + "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf", checkIfExists: true ) """ @@ -50,7 +50,7 @@ nextflow_process { file("${projectDir}/tests/config/tuningparams_small.csv", checkIfExists: true), ] input[1] = file( - "https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/nascent/reference/kgChr7.gtf", + "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf", checkIfExists: true ) """ diff --git a/modules/local/grohmm/transcriptcalling/tests/main.nf.test b/modules/local/grohmm/transcriptcalling/tests/main.nf.test index 
06387b5d..bc6d31e4 100644 --- a/modules/local/grohmm/transcriptcalling/tests/main.nf.test +++ b/modules/local/grohmm/transcriptcalling/tests/main.nf.test @@ -19,7 +19,7 @@ nextflow_process { [], ] input[1] = file( - "https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/nascent/reference/kgChr7.gtf", + "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf", checkIfExists: true ) """ @@ -49,7 +49,7 @@ nextflow_process { file("${projectDir}/tests/config/tuningparams_small.csv", checkIfExists: true), ] input[1] = file( - "https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/nascent/reference/kgChr7.gtf", + "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf", checkIfExists: true ) """ From d9654915b8e9884f44127354b3a7ae91fd347da4 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Tue, 24 Sep 2024 10:51:06 -0500 Subject: [PATCH 20/54] fix(grohmm): Set memory.limit --- bin/parameter_tuning.R | 10 ++++++++++ bin/transcriptcalling_grohmm.R | 13 +++++++++++-- modules/local/grohmm/transcriptcalling/main.nf | 5 +++-- tests/nextflow.config | 8 ++++++++ 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/bin/parameter_tuning.R b/bin/parameter_tuning.R index 8c513a6a..0133390a 100755 --- a/bin/parameter_tuning.R +++ b/bin/parameter_tuning.R @@ -79,9 +79,19 @@ parser$add_argument( metavar = "integer", help = "Number of cores." 
) +parser$add_argument( + "-m", + "--memory", + type = "integer", + default = 56000, + metavar = "integer", + help = "Amount of memory in MB" +) args <- parser$parse_args() +options(mc.cores = getCores(args$cores)) +memory.limit(size = args$memory) setwd(args$outdir) if (is.null(args$bam_files)) { diff --git a/bin/transcriptcalling_grohmm.R b/bin/transcriptcalling_grohmm.R index 6bcd5cd7..a006e783 100755 --- a/bin/transcriptcalling_grohmm.R +++ b/bin/transcriptcalling_grohmm.R @@ -79,11 +79,19 @@ parser$add_argument( metavar = "integer", help = "Number of cores." ) +parser$add_argument( + "-m", + "--memory", + type = "integer", + default = 56000, + metavar = "integer", + help = "Amount of memory in MB" +) args <- parser$parse_args() options(mc.cores = getCores(args$cores)) - +memory.limit(size = args$memory) setwd(args$outdir) # Load alignment files @@ -131,12 +139,13 @@ write.table( ) print("Input transcript annotations") -kg_db <- makeTxDbFromGFF(args$gtf, format = "gtf") +kg_db <- makeTxDbFromGFF(args$gtf) kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name")) # TODO I wonder if I could speed things up by filtering by chromosome at the Nextflow level # filter=list(tx_chrom="chr7")) # exclude any transcripts that are located on chromosomes labeled with "random". kg_tx <- kg_tx[grep("random", as.character(seqnames(kg_tx)), invert = TRUE), ] +print("Printing kg_tx.......") print(kg_tx) print("Collapse annotations in preparation for overlap") kg_consensus <- makeConsensusAnnotations( diff --git a/modules/local/grohmm/transcriptcalling/main.nf b/modules/local/grohmm/transcriptcalling/main.nf index f7df587f..8a6d35cd 100644 --- a/modules/local/grohmm/transcriptcalling/main.nf +++ b/modules/local/grohmm/transcriptcalling/main.nf @@ -5,8 +5,8 @@ process GROHMM_TRANSCRIPTCALLING { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'oras://community.wave.seqera.io/library/grohmm:a660d9c3942c9b85' : - 'community.wave.seqera.io/library/grohmm:780b8693bdaa87b9' }" + 'oras://community.wave.seqera.io/library/grohmm:a660d9c3942c9b85' : + 'community.wave.seqera.io/library/grohmm:780b8693bdaa87b9' }" input: tuple val(meta), path(bams), path(bais), path(tuning_file) @@ -34,6 +34,7 @@ process GROHMM_TRANSCRIPTCALLING { --gtf $gtf \\ --outdir ./ \\ --cores $task.cpus \\ + --memory ${task.memory.toMega()} \\ $args cat <<-END_VERSIONS > versions.yml diff --git a/tests/nextflow.config b/tests/nextflow.config index 97eb3736..674e4eb7 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -48,4 +48,12 @@ process { // HACK Tests fail after latest modules update ext.args = { "--disable-small" } } + + // HACK https://stackoverflow.com/a/23419332 + // groHMM uses a lot of memory, so if we run things in parallel it'll start randomly killing jobs + withName: 'GROHMM_.*' { + cpus = 1 + memory = 30.GB + time = 2.h + } } From 013978d02ef5efc94d91085f2dd6bbd50706513a Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Tue, 24 Sep 2024 11:08:23 -0500 Subject: [PATCH 21/54] fix(grohmm): Remove keytype It's in the vignette but the actual function call and docs don't have a use for it. It just defaults to gene_id. 
https://github.com/bioinformaticsorphanage/groHMM/blob/86486c388d12a636041c1fc40d1308f96abb5161/R/makeConsensusAnnotations.R#L47-L53 --- bin/parameter_tuning.R | 1 - bin/transcriptcalling_grohmm.R | 2 +- modules/local/grohmm/parametertuning/main.nf | 5 +- .../grohmm/parametertuning/tests/main.nf.test | 41 +++---------- .../parametertuning/tests/main.nf.test.snap | 34 +++++++++++ .../transcriptcalling/tests/main.nf.test | 11 +++- .../transcriptcalling/tests/main.nf.test.snap | 58 +++++++++++++++++++ tests/nextflow.config | 8 --- 8 files changed, 113 insertions(+), 47 deletions(-) create mode 100644 modules/local/grohmm/parametertuning/tests/main.nf.test.snap create mode 100644 modules/local/grohmm/transcriptcalling/tests/main.nf.test.snap diff --git a/bin/parameter_tuning.R b/bin/parameter_tuning.R index 0133390a..168ad777 100755 --- a/bin/parameter_tuning.R +++ b/bin/parameter_tuning.R @@ -116,7 +116,6 @@ kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name")) print("Collapse annotations in preparation for overlap") kg_consensus <- makeConsensusAnnotations( kg_tx, - keytype = "gene_id", mc.cores = args$cores ) print("Finished consensus annotations") diff --git a/bin/transcriptcalling_grohmm.R b/bin/transcriptcalling_grohmm.R index a006e783..62134e1b 100755 --- a/bin/transcriptcalling_grohmm.R +++ b/bin/transcriptcalling_grohmm.R @@ -142,6 +142,7 @@ print("Input transcript annotations") kg_db <- makeTxDbFromGFF(args$gtf) kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name")) # TODO I wonder if I could speed things up by filtering by chromosome at the Nextflow level +# https://github.com/google/deepvariant/issues/744 # filter=list(tx_chrom="chr7")) # exclude any transcripts that are located on chromosomes labeled with "random". 
kg_tx <- kg_tx[grep("random", as.character(seqnames(kg_tx)), invert = TRUE), ] @@ -150,7 +151,6 @@ print(kg_tx) print("Collapse annotations in preparation for overlap") kg_consensus <- makeConsensusAnnotations( kg_tx, - keytype = "gene_id", mc.cores = args$cores ) print("Finished consensus annotations") diff --git a/modules/local/grohmm/parametertuning/main.nf b/modules/local/grohmm/parametertuning/main.nf index ba0db7ba..f4c3865f 100644 --- a/modules/local/grohmm/parametertuning/main.nf +++ b/modules/local/grohmm/parametertuning/main.nf @@ -1,13 +1,12 @@ process GROHMM_PARAMETERTUNING { tag "$meta.id|$UTS|$LtProbB" - debug true label 'process_high' label 'process_long' conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'oras://community.wave.seqera.io/library/grohmm:a660d9c3942c9b85' : - 'community.wave.seqera.io/library/grohmm:780b8693bdaa87b9' }" + 'oras://community.wave.seqera.io/library/grohmm:a660d9c3942c9b85' : + 'community.wave.seqera.io/library/grohmm:780b8693bdaa87b9' }" input: tuple val(meta), path(bams), path(bais) diff --git a/modules/local/grohmm/parametertuning/tests/main.nf.test b/modules/local/grohmm/parametertuning/tests/main.nf.test index 4c7fa796..4a6d615b 100644 --- a/modules/local/grohmm/parametertuning/tests/main.nf.test +++ b/modules/local/grohmm/parametertuning/tests/main.nf.test @@ -4,7 +4,7 @@ nextflow_process { script "modules/local/grohmm/parametertuning/main.nf" process "GROHMM_PARAMETERTUNING" - test("Should fail with a tuning file") { + test("Should run with defaults") { when { params { outdir = "$outputDir" @@ -16,50 +16,25 @@ nextflow_process { [ file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S0mR1.bam", checkIfExists: true), file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S40mR1.bam", checkIfExists: true) ], [], - [], ] input[1] = file( 
"https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf", checkIfExists: true ) + input[2] = 5 + input[3] = -100 """ } } then { - // FIXME Broken test assertAll( - { assert process.failed }, - { assert snapshot(process.out).match() } + { assert process.success }, + { assert snapshot( + process.out.tuning, + ).match() }, + { assert snapshot(path(process.out.versions.get(0)).yaml).match("versions") }, ) } } - - test("Should run with a tuning file and without bai files") { - when { - params { - outdir = "$outputDir" - } - process { - """ - input[0] = [ - [ id: 'Sall' ], - [ file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S0mR1.bam", checkIfExists: true), - file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S40mR1.bam", checkIfExists: true) ], - [], - file("${projectDir}/tests/config/tuningparams_small.csv", checkIfExists: true), - ] - input[1] = file( - "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf", - checkIfExists: true - ) - """ - } - } - - then { - // TODO - assert process.failed - } - } } diff --git a/modules/local/grohmm/parametertuning/tests/main.nf.test.snap b/modules/local/grohmm/parametertuning/tests/main.nf.test.snap new file mode 100644 index 00000000..59516a3f --- /dev/null +++ b/modules/local/grohmm/parametertuning/tests/main.nf.test.snap @@ -0,0 +1,34 @@ +{ + "versions": { + "content": [ + { + "GROHMM_PARAMETERTUNING": { + "r-base": "4.3.3", + "bioconductor-grohmm": "1.39.0" + } + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-24T11:44:17.664038682" + }, + "Should run with defaults": { + "content": [ + [ + [ + { + "id": "Sall" + }, + "Sall_5_-100.tuning.csv:md5,abe7dd590cb307eff7abb293eec38190" + ] + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + 
"timestamp": "2024-09-24T11:44:17.28827251" + } +} \ No newline at end of file diff --git a/modules/local/grohmm/transcriptcalling/tests/main.nf.test b/modules/local/grohmm/transcriptcalling/tests/main.nf.test index bc6d31e4..0395a6fc 100644 --- a/modules/local/grohmm/transcriptcalling/tests/main.nf.test +++ b/modules/local/grohmm/transcriptcalling/tests/main.nf.test @@ -57,7 +57,16 @@ nextflow_process { } then { - assert process.success + assertAll( + { assert process.success }, + { assert snapshot( + process.out.transcripts, + process.out.eval, + process.out.transcripts_bed, + process.out.td, + ).match() }, + { assert snapshot(path(process.out.versions.get(0)).yaml).match("versions") }, + ) } } diff --git a/modules/local/grohmm/transcriptcalling/tests/main.nf.test.snap b/modules/local/grohmm/transcriptcalling/tests/main.nf.test.snap new file mode 100644 index 00000000..04482942 --- /dev/null +++ b/modules/local/grohmm/transcriptcalling/tests/main.nf.test.snap @@ -0,0 +1,58 @@ +{ + "versions": { + "content": [ + { + "GROHMM_TRANSCRIPTCALLING": { + "r-base": "4.3.3", + "bioconductor-grohmm": "1.39.0" + } + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-24T11:27:57.580852329" + }, + "Should run with a tuning file": { + "content": [ + [ + [ + { + "id": "mR1" + }, + "mR1.transcripts.txt:md5,06690e4932d6f2d597c2f482fb0848d2" + ] + ], + [ + [ + { + "id": "mR1" + }, + "mR1.eval.txt:md5,4dc82be9170bf40cdb88e187d805f9df" + ] + ], + [ + [ + { + "id": "mR1" + }, + "mR1.final.transcripts.bed:md5,db367d8482ee6b88af176c35e58807d9" + ] + ], + [ + [ + { + "id": "mR1" + }, + "mR1.tdFinal.txt:md5,24d8292ec9f9ab8e2aac99fb6061d26c" + ] + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-24T11:27:57.184918004" + } +} \ No newline at end of file diff --git a/tests/nextflow.config b/tests/nextflow.config index 674e4eb7..97eb3736 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ 
-48,12 +48,4 @@ process { // HACK Tests fail after latest modules update ext.args = { "--disable-small" } } - - // HACK https://stackoverflow.com/a/23419332 - // groHMM uses a lot of memory, so if we run things in parallel it'll start randomly killing jobs - withName: 'GROHMM_.*' { - cpus = 1 - memory = 30.GB - time = 2.h - } } From af3cba819077ad121bc1afcf729a5d83cfdba959 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Tue, 24 Sep 2024 12:21:45 -0500 Subject: [PATCH 22/54] refactor(grohmm): Get jobs running for every set of possibilities --- conf/modules.config | 2 +- conf/test.config | 5 +++-- modules/local/grohmm/parametertuning/main.nf | 6 ++---- .../local/grohmm/parametertuning/tests/main.nf.test | 4 ++-- subworkflows/local/grohmm/main.nf | 11 +++++++---- 5 files changed, 15 insertions(+), 13 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index ebb6cbe1..a38a87c0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -367,7 +367,7 @@ process { ] } - withName: GROHMM_TRANSCRIPTCALLING { + withName: "GROHMM_.*" { publishDir = [ [ path: { "${params.outdir}/transcript_identification/grohmm" }, diff --git a/conf/test.config b/conf/test.config index 7f46f87a..52ee758d 100644 --- a/conf/test.config +++ b/conf/test.config @@ -35,10 +35,11 @@ params { skip_grohmm = true // FIXME Fails due to higher memory requirements // tuning_file = "${projectDir}/tests/config/tuningparams_small.csv" tuning_file = null + // Just run 2 jobs instead of 8 grohmm_min_uts = 5 - grohmm_max_uts = 10 + grohmm_max_uts = 5 grohmm_min_ltprobb = -100 - grohmm_max_ltprobb = -150 + grohmm_max_ltprobb = -100 filter_bed = "${projectDir}/tests/config/unwanted_region.bed" intersect_bed = "${projectDir}/tests/config/wanted_region.bed" } diff --git a/modules/local/grohmm/parametertuning/main.nf b/modules/local/grohmm/parametertuning/main.nf index f4c3865f..7cb8828f 100644 --- a/modules/local/grohmm/parametertuning/main.nf +++ b/modules/local/grohmm/parametertuning/main.nf 
@@ -1,5 +1,5 @@ process GROHMM_PARAMETERTUNING { - tag "$meta.id|$UTS|$LtProbB" + tag "$meta.id|$UTS| $LtProbB" label 'process_high' label 'process_long' @@ -9,10 +9,8 @@ process GROHMM_PARAMETERTUNING { 'community.wave.seqera.io/library/grohmm:780b8693bdaa87b9' }" input: - tuple val(meta), path(bams), path(bais) + tuple val(meta), path(bams), path(bais), val(UTS), val(LtProbB) path gtf - val UTS - val LtProbB output: tuple val(meta), path("*.tuning.csv"), emit: tuning diff --git a/modules/local/grohmm/parametertuning/tests/main.nf.test b/modules/local/grohmm/parametertuning/tests/main.nf.test index 4a6d615b..022cb455 100644 --- a/modules/local/grohmm/parametertuning/tests/main.nf.test +++ b/modules/local/grohmm/parametertuning/tests/main.nf.test @@ -16,13 +16,13 @@ nextflow_process { [ file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S0mR1.bam", checkIfExists: true), file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S40mR1.bam", checkIfExists: true) ], [], + 5, + -100, ] input[1] = file( "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf", checkIfExists: true ) - input[2] = 5 - input[3] = -100 """ } } diff --git a/subworkflows/local/grohmm/main.nf b/subworkflows/local/grohmm/main.nf index 15b867b6..145b362d 100644 --- a/subworkflows/local/grohmm/main.nf +++ b/subworkflows/local/grohmm/main.nf @@ -26,12 +26,15 @@ workflow GROHMM { // 5..45 by 5 for UTS is what we had currently ch_uts = channel.fromList((params.grohmm_min_uts..params.grohmm_max_uts).step(5)) // -100..-400 by 50 for LtProbB - ch_ltprobb = channel.fromList((params.grohmm_min_ltprobb..params.grohmm_max_ltprobb).step(50)).view() + ch_ltprobb = channel.fromList((params.grohmm_min_ltprobb..params.grohmm_max_ltprobb).step(50)) + ch_bams_bais_uts_ltprobb = + bams_bais + .combine(ch_uts) + .combine(ch_ltprobb) + GROHMM_PARAMETERTUNING ( - bams_bais, + 
ch_bams_bais_uts_ltprobb, gtf, - ch_uts, - ch_ltprobb, ) .tuning .collectFile( From a5e2bcc0bd37446ab8d89723bb0b060750359cfd Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Tue, 24 Sep 2024 13:18:23 -0500 Subject: [PATCH 23/54] test(grohmm): Test the whole subworkflow --- subworkflows/local/grohmm/tests/main.nf.test | 101 +++++------------- .../local/grohmm/tests/main.nf.test.snap | 33 ++++++ 2 files changed, 59 insertions(+), 75 deletions(-) create mode 100644 subworkflows/local/grohmm/tests/main.nf.test.snap diff --git a/subworkflows/local/grohmm/tests/main.nf.test b/subworkflows/local/grohmm/tests/main.nf.test index 59b159d0..38115248 100644 --- a/subworkflows/local/grohmm/tests/main.nf.test +++ b/subworkflows/local/grohmm/tests/main.nf.test @@ -1,6 +1,6 @@ nextflow_workflow { - name "Test Workflow GROHMM" + name "Test subworkflow GROHMM" script "../main.nf" workflow "GROHMM" @@ -8,81 +8,24 @@ nextflow_workflow { when { params { outdir = "$outputDir" - skip_tuning = true + grohmm_min_uts = 5 + grohmm_max_uts = 10 + grohmm_min_ltprobb = -100 + grohmm_max_ltprobb = -150 } workflow { """ - input[0] = [ - [ id: 'mR1' ], - [ file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/illumina/bam/test.single_end.sorted.bam", checkIfExists: true), ] - ] - // FIXME This needs to match up with s40 - input[1] = file( - "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/genome/genome.gtf", - checkIfExists: true - ) - input[2] = [] - """ - } - } - - then { - assertAll( - { assert workflow.success }, - { assert snapshot(workflow.out).match() } - ) - } - - } - - test("Should run with multiple bams") { - when { - params { - outdir = "$outputDir" - } - workflow { - """ - input[0] = [ - [ id: 'mR1' ], - [ file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S0mR1.bam", checkIfExists: true), - file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S40mR1.bam", 
checkIfExists: true) ] - ] - input[1] = file( - "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.gtf", - checkIfExists: true - ) - input[2] = [] - """ - } - } - - then { - assertAll( - { assert workflow.success }, - { assert snapshot(workflow.out).match() } - ) - } - - } - - test("Should run with a tuning file") { - - when { - params { - outdir = "$outputDir" - } - workflow { - """ - input[0] = [ - [ id: 'mR1' ], + input[0] = Channel.of([ + [ id: 'Sall' ], [ file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S0mR1.bam", checkIfExists: true), - file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S40mR1.bam", checkIfExists: true) ] - ] - input[1] = file( - "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.gtf", - checkIfExists: true - ) - input[2] = file("https://raw.githubusercontent.com/nf-core/test-datasets/nascent/misc/tune.csv", checkIfExists: true) + file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S40mR1.bam", checkIfExists: true) ], + [], + ]) + input[1] = Channel.of([file( + "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf", + checkIfExists: true + )]) + input[2] = Channel.of([]) """ } } @@ -90,10 +33,18 @@ nextflow_workflow { then { assertAll( { assert workflow.success }, - { assert snapshot(workflow.out).match() } + { assert snapshot( + workflow.trace.tasks().size(), + workflow.out.transcripts, + workflow.out.bed, + // workflow.out.td_plot, + ).match() + }, + { assert snapshot( + path(workflow.out.versions.get(0)).yaml) + .match("versions") + }, ) } - } - } diff --git a/subworkflows/local/grohmm/tests/main.nf.test.snap b/subworkflows/local/grohmm/tests/main.nf.test.snap new file mode 100644 index 00000000..ba71da17 --- /dev/null +++ 
b/subworkflows/local/grohmm/tests/main.nf.test.snap @@ -0,0 +1,33 @@ +{ + "versions": { + "content": [ + { + "GROHMM:GROHMM_PARAMETERTUNING": { + "r-base": "4.3.3", + "bioconductor-grohmm": "1.39.0" + } + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-24T12:58:04.996402914" + }, + "Should run without a tuning file": { + "content": [ + 1, + [ + + ], + [ + + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-24T12:58:04.685610798" + } +} \ No newline at end of file From 7807899d221707b3de56821d5743308cf131559f Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Tue, 24 Sep 2024 13:43:47 -0500 Subject: [PATCH 24/54] fix(grohmm): Give up on taking a tuning file Can't support everything, if you're gonna run groHMM, you're just going to run the tuning. If you want to minimize the runs, set the grohmm_{min,max}_uts, and grohmm_{min,max}_ltprobb variables to the desired values --- CHANGELOG.md | 4 ++ bin/parameter_tuning.R | 8 --- conf/test.config | 2 - nextflow.config | 1 - nextflow_schema.json | 15 ----- subworkflows/local/grohmm/main.nf | 59 ++++++++----------- subworkflows/local/grohmm/tests/main.nf.test | 11 ++-- .../local/grohmm/tests/main.nf.test.snap | 4 +- .../local/transcript_identification.nf | 3 +- tests/nextflow.config | 1 - .../grohmm/skip_tuning/main.nf.test | 3 +- 11 files changed, 40 insertions(+), 71 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b134164..d09edad9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#170](https://github.com/nf-core/nascent/pull/170) - Remove "Access to undefined parameter forwardStranded" warnings +### Removed + +- Support for groHMM tuning files + ## v2.2.0 - 2024-03-05 ### Added diff --git a/bin/parameter_tuning.R b/bin/parameter_tuning.R index 168ad777..f207c9bb 100755 --- a/bin/parameter_tuning.R +++ 
b/bin/parameter_tuning.R @@ -16,14 +16,6 @@ parser$add_argument( help = "GRO SEQ data in bam files.", required = TRUE ) -parser$add_argument( - "-t", - "--tuning_file", - type = "character", - default = NULL, - metavar = "path", - help = "File with tuning parameters and error rates." -) parser$add_argument( "-o", "--outdir", diff --git a/conf/test.config b/conf/test.config index 52ee758d..e841fb11 100644 --- a/conf/test.config +++ b/conf/test.config @@ -33,8 +33,6 @@ params { assay_type = "GROseq" skip_grohmm = true // FIXME Fails due to higher memory requirements - // tuning_file = "${projectDir}/tests/config/tuningparams_small.csv" - tuning_file = null // Just run 2 jobs instead of 8 grohmm_min_uts = 5 grohmm_max_uts = 5 diff --git a/nextflow.config b/nextflow.config index e1cfc965..36c88a4a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -25,7 +25,6 @@ params { // Transcript identification method assay_type = null skip_grohmm = false - tuning_file = null grohmm_min_uts = 5 grohmm_max_uts = 45 // Depends on how you look at this one... But I figured most will ignore the negative diff --git a/nextflow_schema.json b/nextflow_schema.json index 02574536..60465a52 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -117,12 +117,6 @@ "R2_3" ] }, - "skip_tuning": { - "type": "boolean", - "description": "Skip groHMM tuning step as it can take a long time.", - "fa_icon": "fas fa-wrench", - "hidden": true - }, "skip_grohmm": { "type": "boolean", "description": "Skip groHMM all together", @@ -155,15 +149,6 @@ "help_text": "Depends on how you look at this one, which is the minimum and maximum... 
But I figured most will ignore the negative, so we went with absolute values.", "default": -400 }, - "tuning_file": { - "type": "string", - "format": "file-path", - "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$", - "hidden": true, - "fa_icon": "fas fa-file-csv", - "description": "File of parameters to test for groHMM tuning" - }, "filter_bed": { "type": "string", "fa_icon": "fas fa-filter", diff --git a/subworkflows/local/grohmm/main.nf b/subworkflows/local/grohmm/main.nf index 145b362d..67f2a8e5 100644 --- a/subworkflows/local/grohmm/main.nf +++ b/subworkflows/local/grohmm/main.nf @@ -12,47 +12,40 @@ workflow GROHMM { take: bams_bais gtf - tuning_file + // TODO Support rerunning with a tuning file + // tuning_file main: ch_versions = Channel.empty() - ch_tuning = Channel.empty() - - // if(!tuning_file) { - // Run transcriptcalling eval for each tuning param - // Should avoid a tuning file with a row for everything - // 5..45 by 5 for UTS is what we had currently - ch_uts = channel.fromList((params.grohmm_min_uts..params.grohmm_max_uts).step(5)) - // -100..-400 by 50 for LtProbB - ch_ltprobb = channel.fromList((params.grohmm_min_ltprobb..params.grohmm_max_ltprobb).step(50)) - ch_bams_bais_uts_ltprobb = - bams_bais - .combine(ch_uts) - .combine(ch_ltprobb) - - GROHMM_PARAMETERTUNING ( - ch_bams_bais_uts_ltprobb, - gtf, + // Run transcriptcalling eval for each tuning param + // Should avoid a tuning file with a row for everything + // 5..45 by 5 for UTS is what we had currently + ch_uts = channel.fromList((params.grohmm_min_uts..params.grohmm_max_uts).step(5)) + // -100..-400 by 50 for LtProbB + ch_ltprobb = channel.fromList((params.grohmm_min_ltprobb..params.grohmm_max_ltprobb).step(50)) + ch_bams_bais_uts_ltprobb = + bams_bais + .combine(ch_uts) + .combine(ch_ltprobb) + + GROHMM_PARAMETERTUNING ( + ch_bams_bais_uts_ltprobb, + gtf, + ) + .tuning + .collectFile( + name: "${params.outdir}/transcript_identification/grohmm/tuning.csv", + keepHeader: true, + skip: 
1, + newLine: true, ) - .tuning - .collectFile( - name: "${params.outdir}/transcript_identification/grohmm/tuning.csv", - keepHeader: true, - skip: 1, - newLine: true, - ) - .set { tuning } + .set { ch_tuning } + ch_versions = ch_versions.mix(GROHMM_PARAMETERTUNING.out.versions.first()) - ch_bams_bais_tuning = bams_bais.join(tuning, by: [0]) + ch_bams_bais_tuning = bams_bais.join(ch_tuning, by: [0]) - ch_versions = ch_versions.mix(GROHMM_PARAMETERTUNING.out.versions.first()) - // } else { - // If a tuning file is provided, run transcriptcalling once - // NOTE This doesn't really handle multiple "groups well" - // ch_bams_bais_tuning = bams_bais.join(tuning_file) - // } GROHMM_TRANSCRIPTCALLING ( ch_bams_bais_tuning, diff --git a/subworkflows/local/grohmm/tests/main.nf.test b/subworkflows/local/grohmm/tests/main.nf.test index 38115248..bf1a4874 100644 --- a/subworkflows/local/grohmm/tests/main.nf.test +++ b/subworkflows/local/grohmm/tests/main.nf.test @@ -4,14 +4,14 @@ nextflow_workflow { script "../main.nf" workflow "GROHMM" - test("Should run without a tuning file") { + test("Should run without a tuning file and multiplex") { when { params { outdir = "$outputDir" - grohmm_min_uts = 5 - grohmm_max_uts = 10 - grohmm_min_ltprobb = -100 - grohmm_max_ltprobb = -150 + grohmm_min_uts = 5 + grohmm_max_uts = 10 + grohmm_min_ltprobb = -100 + grohmm_max_ltprobb = -150 } workflow { """ @@ -25,7 +25,6 @@ nextflow_workflow { "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf", checkIfExists: true )]) - input[2] = Channel.of([]) """ } } diff --git a/subworkflows/local/grohmm/tests/main.nf.test.snap b/subworkflows/local/grohmm/tests/main.nf.test.snap index ba71da17..f513363b 100644 --- a/subworkflows/local/grohmm/tests/main.nf.test.snap +++ b/subworkflows/local/grohmm/tests/main.nf.test.snap @@ -14,7 +14,7 @@ }, "timestamp": "2024-09-24T12:58:04.996402914" }, - "Should run without a tuning 
file": { + "Should run without a tuning file and multiplex": { "content": [ 1, [ @@ -28,6 +28,6 @@ "nf-test": "0.9.0", "nextflow": "24.04.4" }, - "timestamp": "2024-09-24T12:58:04.685610798" + "timestamp": "2024-09-24T13:38:02.218159522" } } \ No newline at end of file diff --git a/subworkflows/local/transcript_identification.nf b/subworkflows/local/transcript_identification.nf index a13082b4..b1217a90 100644 --- a/subworkflows/local/transcript_identification.nf +++ b/subworkflows/local/transcript_identification.nf @@ -23,10 +23,9 @@ workflow TRANSCRIPT_INDENTIFICATION { ch_versions = Channel.empty() ch_identification_bed = Channel.empty() - ch_tuning_file = params.tuning_file ? file(params.tuning_file, checkIfExists: true) : file("${projectDir}/assets/tuningparamstotest.csv") grohmm_td_plot = Channel.empty() if(!params.skip_grohmm && params.assay_type == "GROseq") { - GROHMM ( group_bam_bai, gtf, ch_tuning_file ) + GROHMM ( group_bam_bai, gtf ) ch_identification_bed = ch_identification_bed.mix(GROHMM.out.bed) grohmm_td_plot = GROHMM.out.td_plot ch_versions = ch_versions.mix(GROHMM.out.versions.first()) diff --git a/tests/nextflow.config b/tests/nextflow.config index 97eb3736..a029ac75 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -38,7 +38,6 @@ params { assay_type = "GROseq" skip_grohmm = true // FIXME Fails due to higher memory requirements - tuning_file = "${projectDir}/tests/config/tuningparams_small.csv" filter_bed = "${projectDir}/tests/config/unwanted_region.bed" intersect_bed = "${projectDir}/tests/config/wanted_region.bed" } diff --git a/workflows/tests/transcript_indentification/grohmm/skip_tuning/main.nf.test b/workflows/tests/transcript_indentification/grohmm/skip_tuning/main.nf.test index b9d975bd..4885b0b7 100644 --- a/workflows/tests/transcript_indentification/grohmm/skip_tuning/main.nf.test +++ b/workflows/tests/transcript_indentification/grohmm/skip_tuning/main.nf.test @@ -4,6 +4,7 @@ nextflow_pipeline { script 
"../../../main.nf" tag "groHMM" + // TODO Support a tuning file test("Should be able to skip tuning") { when { @@ -16,7 +17,7 @@ nextflow_pipeline { then { assertAll( - { assert workflow.success }, + { assert workflow.failure }, { assert snapshot(UTILS.removeNextflowVersion("$outputDir/pipeline_info/nf_core_pipeline_software_mqc_versions.yml")).match("software_versions") }, { assert snapshot( workflow.trace.tasks().size(), From d67acdee575760072af38ee3e468100d7eb4597f Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Tue, 24 Sep 2024 16:12:01 -0500 Subject: [PATCH 25/54] test(grohmm): Fix how the channel is created to avoid exhausting it https://midnighter.github.io/nextflow-gotchas/gotchas/singleton-channel/ --- subworkflows/local/grohmm/tests/main.nf.test | 2 +- subworkflows/local/grohmm/tests/main.nf.test.snap | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/grohmm/tests/main.nf.test b/subworkflows/local/grohmm/tests/main.nf.test index bf1a4874..905fbf5d 100644 --- a/subworkflows/local/grohmm/tests/main.nf.test +++ b/subworkflows/local/grohmm/tests/main.nf.test @@ -24,7 +24,7 @@ nextflow_workflow { input[1] = Channel.of([file( "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf", checkIfExists: true - )]) + )]).first() """ } } diff --git a/subworkflows/local/grohmm/tests/main.nf.test.snap b/subworkflows/local/grohmm/tests/main.nf.test.snap index f513363b..d5b4fa62 100644 --- a/subworkflows/local/grohmm/tests/main.nf.test.snap +++ b/subworkflows/local/grohmm/tests/main.nf.test.snap @@ -16,7 +16,7 @@ }, "Should run without a tuning file and multiplex": { "content": [ - 1, + 4, [ ], @@ -28,6 +28,6 @@ "nf-test": "0.9.0", "nextflow": "24.04.4" }, - "timestamp": "2024-09-24T13:38:02.218159522" + "timestamp": "2024-09-24T16:07:41.806165357" } } \ No newline at end of file From 09212f9bf0b110e84bf308e1007bc26921c3b78e Mon Sep 17 00:00:00 2001 
From: Edmund Miller Date: Tue, 24 Sep 2024 18:05:20 -0500 Subject: [PATCH 26/54] fix(grohmm): Get transcript calling running --- modules/local/grohmm/parametertuning/main.nf | 2 +- subworkflows/local/grohmm/main.nf | 19 ++++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/modules/local/grohmm/parametertuning/main.nf b/modules/local/grohmm/parametertuning/main.nf index 7cb8828f..b865f407 100644 --- a/modules/local/grohmm/parametertuning/main.nf +++ b/modules/local/grohmm/parametertuning/main.nf @@ -1,5 +1,5 @@ process GROHMM_PARAMETERTUNING { - tag "$meta.id|$UTS| $LtProbB" + tag "$meta.id|$UTS|$LtProbB" label 'process_high' label 'process_long' diff --git a/subworkflows/local/grohmm/main.nf b/subworkflows/local/grohmm/main.nf index 67f2a8e5..f3f5b3b4 100644 --- a/subworkflows/local/grohmm/main.nf +++ b/subworkflows/local/grohmm/main.nf @@ -36,19 +36,28 @@ workflow GROHMM { ) .tuning .collectFile( - name: "${params.outdir}/transcript_identification/grohmm/tuning.csv", keepHeader: true, skip: 1, newLine: true, + storeDir: "${params.outdir}/transcript_identification/grohmm/", ) + { meta, file -> + filename = "${meta.id}.${meta.single_end ? 'SE': 'PE' }.tuning.csv" + [filename, file.text] + } + .map { path -> + meta = [ + id:path.getSimpleName(), + single_end: path.getName().split("\\.")[1] == 'SE' ? 
true : false + ] + [meta, file(path)] + } .set { ch_tuning } - ch_versions = ch_versions.mix(GROHMM_PARAMETERTUNING.out.versions.first()) - - ch_bams_bais_tuning = bams_bais.join(ch_tuning, by: [0]) + ch_versions = ch_versions.mix(GROHMM_PARAMETERTUNING.out.versions.first()) GROHMM_TRANSCRIPTCALLING ( - ch_bams_bais_tuning, + bams_bais.join(ch_tuning, by: [0]), gtf, ) ch_versions = ch_versions.mix(GROHMM_TRANSCRIPTCALLING.out.versions.first()) From d2e40651d8a66d16c33ef15902d3c5c7191a798f Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Tue, 24 Sep 2024 19:53:43 -0500 Subject: [PATCH 27/54] fix(grohmm): Clean up tuning file to match --- bin/parameter_tuning.R | 19 ++++++++++++++++++- bin/transcriptcalling_grohmm.R | 1 + subworkflows/local/grohmm/main.nf | 2 +- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/bin/parameter_tuning.R b/bin/parameter_tuning.R index f207c9bb..791e3e0e 100755 --- a/bin/parameter_tuning.R +++ b/bin/parameter_tuning.R @@ -116,6 +116,10 @@ print("Finished consensus annotations") ## TUNING ## ############ print("Starting tuning run") +tune <- data.frame( + LtProbB = args$ltprobb, + UTS = args$uts +) Fp <- windowAnalysis(alignments, strand = "+", windowSize = 50) Fm <- windowAnalysis(alignments, strand = "-", windowSize = 50) hmm <- detectTranscripts( @@ -127,10 +131,23 @@ hmm <- detectTranscripts( ) print("Evaluating") e <- evaluateHMMInAnnotations(hmm$transcripts, kg_consensus) + +# Extract evaluation metrics and convert to a data frame +eval_metrics <- as.data.frame(e$eval) + +# If eval_metrics is a list of lists, unlist it +if (is.list(eval_metrics[[1]])) { + eval_metrics <- as.data.frame(t(sapply(e$eval, unlist))) +} + +# Combine the tuning parameters with the evaluation metrics +tune <- cbind(tune, eval_metrics) + print(e$eval) print(e) -write.csv(e$eval, file = paste0(args$outprefix, ".tuning.csv")) +# Write the combined data to a CSV file without row names +write.csv(tune, file = paste0(args$outprefix, ".tuning.csv"), 
row.names = FALSE) ######################## ## CITE PACKAGES USED ## diff --git a/bin/transcriptcalling_grohmm.R b/bin/transcriptcalling_grohmm.R index 62134e1b..e5eeef55 100755 --- a/bin/transcriptcalling_grohmm.R +++ b/bin/transcriptcalling_grohmm.R @@ -91,6 +91,7 @@ parser$add_argument( args <- parser$parse_args() options(mc.cores = getCores(args$cores)) +## Windows Specific doesn't actually do anything memory.limit(size = args$memory) setwd(args$outdir) diff --git a/subworkflows/local/grohmm/main.nf b/subworkflows/local/grohmm/main.nf index f3f5b3b4..bec93af9 100644 --- a/subworkflows/local/grohmm/main.nf +++ b/subworkflows/local/grohmm/main.nf @@ -38,7 +38,7 @@ workflow GROHMM { .collectFile( keepHeader: true, skip: 1, - newLine: true, + newLine: false, storeDir: "${params.outdir}/transcript_identification/grohmm/", ) { meta, file -> From 2286b7aeee0b9afe210a65c7285c572a262c6a8a Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Wed, 25 Sep 2024 16:02:11 -0500 Subject: [PATCH 28/54] fix(grohmm): Update groHMM with fix Now it handles when there are no broken transcripts(ie with a test dataset) --- bin/transcriptcalling_grohmm.R | 3 ++ conf/test.config | 5 ++-- .../grohmm/parametertuning/environment.yml | 4 +-- modules/local/grohmm/parametertuning/main.nf | 4 +-- .../grohmm/transcriptcalling/environment.yml | 2 +- .../local/grohmm/transcriptcalling/main.nf | 4 +-- .../grohmm/skip_tuning/main.nf.test | 28 +------------------ 7 files changed, 13 insertions(+), 37 deletions(-) diff --git a/bin/transcriptcalling_grohmm.R b/bin/transcriptcalling_grohmm.R index e5eeef55..5600ca54 100755 --- a/bin/transcriptcalling_grohmm.R +++ b/bin/transcriptcalling_grohmm.R @@ -120,6 +120,9 @@ if (is.null(args$tuning_file)) { # Minimum error uts <- tune[which.min(tune$errorRate), "UTS"] lt_probb <- tune[which.min(tune$errorRate), "LtProbB"] + # Print the minimums for debugging + cat("minimum uts:", uts) + cat("minimum lt_probb:", lt_probb) hmm_result <- detectTranscripts( 
alignments, LtProbB = lt_probb, diff --git a/conf/test.config b/conf/test.config index e841fb11..a7a9d6f1 100644 --- a/conf/test.config +++ b/conf/test.config @@ -33,11 +33,10 @@ params { assay_type = "GROseq" skip_grohmm = true // FIXME Fails due to higher memory requirements - // Just run 2 jobs instead of 8 grohmm_min_uts = 5 - grohmm_max_uts = 5 + grohmm_max_uts = 10 grohmm_min_ltprobb = -100 - grohmm_max_ltprobb = -100 + grohmm_max_ltprobb = -150 filter_bed = "${projectDir}/tests/config/unwanted_region.bed" intersect_bed = "${projectDir}/tests/config/wanted_region.bed" } diff --git a/modules/local/grohmm/parametertuning/environment.yml b/modules/local/grohmm/parametertuning/environment.yml index c77ab0b6..5972f503 100644 --- a/modules/local/grohmm/parametertuning/environment.yml +++ b/modules/local/grohmm/parametertuning/environment.yml @@ -7,8 +7,8 @@ dependencies: - conda-forge::r-base # =4.1.1 - conda-forge::r-optparse # =1.7.1 - conda-forge::r-argparse # =2.1.3 - - bioconda::bioconductor-genomicfeatures=1.46.1 + - bioconda::bioconductor-genomicfeatures # NOTE Can't use the official package because of # https://github.com/dankoc/groHMM/issues/2 # - bioconda::bioconductor-grohmm=1.28.0 - - bioinfoorphanage-grohmm + - bioinfoorphanage-grohmm=1.36.1 diff --git a/modules/local/grohmm/parametertuning/main.nf b/modules/local/grohmm/parametertuning/main.nf index b865f407..d2c2ae77 100644 --- a/modules/local/grohmm/parametertuning/main.nf +++ b/modules/local/grohmm/parametertuning/main.nf @@ -5,8 +5,8 @@ process GROHMM_PARAMETERTUNING { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'oras://community.wave.seqera.io/library/grohmm:a660d9c3942c9b85' : - 'community.wave.seqera.io/library/grohmm:780b8693bdaa87b9' }" + 'oras://community.wave.seqera.io/library/grohmm:03357458e0821bcb' : + 'community.wave.seqera.io/library/grohmm:833aa94cad4202ac' }" input: tuple val(meta), path(bams), path(bais), val(UTS), val(LtProbB) diff --git a/modules/local/grohmm/transcriptcalling/environment.yml b/modules/local/grohmm/transcriptcalling/environment.yml index e9f7efa5..98febd37 100644 --- a/modules/local/grohmm/transcriptcalling/environment.yml +++ b/modules/local/grohmm/transcriptcalling/environment.yml @@ -11,4 +11,4 @@ dependencies: # NOTE Can't use the official package because of # https://github.com/dankoc/groHMM/issues/2 # - bioconda::bioconductor-grohmm=1.28.0 - - bioinfoorphanage-grohmm + - bioinfoorphanage-grohmm=1.36.1 diff --git a/modules/local/grohmm/transcriptcalling/main.nf b/modules/local/grohmm/transcriptcalling/main.nf index 8a6d35cd..83105e4b 100644 --- a/modules/local/grohmm/transcriptcalling/main.nf +++ b/modules/local/grohmm/transcriptcalling/main.nf @@ -5,8 +5,8 @@ process GROHMM_TRANSCRIPTCALLING { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'oras://community.wave.seqera.io/library/grohmm:a660d9c3942c9b85' : - 'community.wave.seqera.io/library/grohmm:780b8693bdaa87b9' }" + 'oras://community.wave.seqera.io/library/grohmm:03357458e0821bcb' : + 'community.wave.seqera.io/library/grohmm:833aa94cad4202ac' }" input: tuple val(meta), path(bams), path(bais), path(tuning_file) diff --git a/workflows/tests/transcript_indentification/grohmm/skip_tuning/main.nf.test b/workflows/tests/transcript_indentification/grohmm/skip_tuning/main.nf.test index 4885b0b7..f8cb4aa2 100644 --- a/workflows/tests/transcript_indentification/grohmm/skip_tuning/main.nf.test +++ b/workflows/tests/transcript_indentification/grohmm/skip_tuning/main.nf.test @@ -4,7 +4,6 @@ nextflow_pipeline { script "../../../main.nf" tag "groHMM" - // TODO Support a tuning file test("Should be able to skip tuning") { when { @@ -17,33 +16,8 @@ nextflow_pipeline { then { assertAll( + // TODO Support a tuning file { assert workflow.failure }, - { assert snapshot(UTILS.removeNextflowVersion("$outputDir/pipeline_info/nf_core_pipeline_software_mqc_versions.yml")).match("software_versions") }, - { assert snapshot( - workflow.trace.tasks().size(), - path("$outputDir/transcript_identification/homer/cd4.bed"), - path("$outputDir/transcript_identification/homer/jurkat.bed"), - // FIXME Not determinstic because of the order of files - // Add to the other tests when fixed - // UTILS.getAllFilesFromDir("$outputDir/transcript_identification/pints/", ".bed"), - path("$outputDir/transcript_identification/intersect/").list(), - path("$outputDir/transcript_identification/filtered/").list(), - path("$outputDir/transcript_identification/grohmm/cd4.eval.txt"), - path("$outputDir/transcript_identification/grohmm/cd4.final.transcripts.bed"), - path("$outputDir/transcript_identification/grohmm/cd4.tdFinal.txt"), - path("$outputDir/transcript_identification/grohmm/cd4.tdplot_mqc.jpg").exists(), - path("$outputDir/transcript_identification/grohmm/cd4.transcripts.txt"), - 
path("$outputDir/transcript_identification/grohmm/jurkat.eval.txt"), - path("$outputDir/transcript_identification/grohmm/jurkat.final.transcripts.bed"), - path("$outputDir/transcript_identification/grohmm/jurkat.tdFinal.txt"), - path("$outputDir/transcript_identification/grohmm/jurkat.tdplot_mqc.jpg").exists(), - path("$outputDir/transcript_identification/grohmm/jurkat.transcripts.txt"), - // FIXME Not determinstic because of the order of files - // Add to the other tests when fixed - // path("$outputDir/quantification/").list(), - path("$outputDir/multiqc/multiqc_report.html").exists(), - ).match("output_files") - } ) } } From 9388a21538ec392d7e172cb0ee8b544cc7557110 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Thu, 17 Oct 2024 10:20:48 -0500 Subject: [PATCH 29/54] Add consensus bed as output for testing --- bin/parameter_tuning.R | 2 ++ modules/local/grohmm/parametertuning/main.nf | 1 + .../local/grohmm/parametertuning/tests/main.nf.test | 3 ++- .../grohmm/parametertuning/tests/main.nf.test.snap | 13 +++++-------- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/bin/parameter_tuning.R b/bin/parameter_tuning.R index 791e3e0e..42d613e6 100755 --- a/bin/parameter_tuning.R +++ b/bin/parameter_tuning.R @@ -148,6 +148,8 @@ print(e) # Write the combined data to a CSV file without row names write.csv(tune, file = paste0(args$outprefix, ".tuning.csv"), row.names = FALSE) +# Write kg_consensus to a bed file for testing +export.bed(kg_consensus, con = paste0(args$outprefix, ".tuning.consensus.bed")) ######################## ## CITE PACKAGES USED ## diff --git a/modules/local/grohmm/parametertuning/main.nf b/modules/local/grohmm/parametertuning/main.nf index d2c2ae77..c682f912 100644 --- a/modules/local/grohmm/parametertuning/main.nf +++ b/modules/local/grohmm/parametertuning/main.nf @@ -14,6 +14,7 @@ process GROHMM_PARAMETERTUNING { output: tuple val(meta), path("*.tuning.csv"), emit: tuning + tuple val(meta), path("*.tuning.consensus.bed"), emit: bed 
path "versions.yml", emit: versions when: diff --git a/modules/local/grohmm/parametertuning/tests/main.nf.test b/modules/local/grohmm/parametertuning/tests/main.nf.test index 022cb455..c3b7117b 100644 --- a/modules/local/grohmm/parametertuning/tests/main.nf.test +++ b/modules/local/grohmm/parametertuning/tests/main.nf.test @@ -31,7 +31,8 @@ nextflow_process { assertAll( { assert process.success }, { assert snapshot( - process.out.tuning, + path(process.out.tuning[0][1]).readLines(), + path(process.out.bed[0][1]), ).match() }, { assert snapshot(path(process.out.versions.get(0)).yaml).match("versions") }, ) diff --git a/modules/local/grohmm/parametertuning/tests/main.nf.test.snap b/modules/local/grohmm/parametertuning/tests/main.nf.test.snap index 59516a3f..dce05f0a 100644 --- a/modules/local/grohmm/parametertuning/tests/main.nf.test.snap +++ b/modules/local/grohmm/parametertuning/tests/main.nf.test.snap @@ -17,18 +17,15 @@ "Should run with defaults": { "content": [ [ - [ - { - "id": "Sall" - }, - "Sall_5_-100.tuning.csv:md5,abe7dd590cb307eff7abb293eec38190" - ] - ] + "\"LtProbB\",\"UTS\",\"merged\",\"dissociated\",\"total\",\"errorRate\",\"txSize\"", + "-100,5,33,123,156,0.0756180319922443,921" + ], + "Sall_5_-100.tuning.consensus.bed:md5,f3144b4ff6493a7ccef422aa4e160d46" ], "meta": { "nf-test": "0.9.0", "nextflow": "24.04.4" }, - "timestamp": "2024-09-24T11:44:17.28827251" + "timestamp": "2024-10-17T19:58:42.34659" } } \ No newline at end of file From b7567c8b45d3c4cbbc686ec17eafe271d5ffa50e Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Wed, 25 Sep 2024 16:30:46 -0500 Subject: [PATCH 30/54] test(grohmm): Update all the snapshots --- .../local/grohmm/parametertuning/tests/main.nf.test | 1 + .../grohmm/transcriptcalling/tests/main.nf.test | 1 + .../grohmm/skip_tuning/main.nf.test | 4 ++-- .../grohmm/tuning/main.nf.test | 2 +- .../grohmm/tuning/main.nf.test.snap | 12 ++++++++++++ 5 files changed, 17 insertions(+), 3 deletions(-) create mode 100644 
workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test.snap diff --git a/modules/local/grohmm/parametertuning/tests/main.nf.test b/modules/local/grohmm/parametertuning/tests/main.nf.test index c3b7117b..6e07ec48 100644 --- a/modules/local/grohmm/parametertuning/tests/main.nf.test +++ b/modules/local/grohmm/parametertuning/tests/main.nf.test @@ -3,6 +3,7 @@ nextflow_process { name "Test Process GROHMM_PARAMETERTUNING" script "modules/local/grohmm/parametertuning/main.nf" process "GROHMM_PARAMETERTUNING" + tag "grohmm" test("Should run with defaults") { when { diff --git a/modules/local/grohmm/transcriptcalling/tests/main.nf.test b/modules/local/grohmm/transcriptcalling/tests/main.nf.test index 0395a6fc..01a12c62 100644 --- a/modules/local/grohmm/transcriptcalling/tests/main.nf.test +++ b/modules/local/grohmm/transcriptcalling/tests/main.nf.test @@ -3,6 +3,7 @@ nextflow_process { name "Test Process GROHMM_TRANSCRIPTCALLING" script "modules/local/grohmm/transcriptcalling/main.nf" process "GROHMM_TRANSCRIPTCALLING" + tag "groHMM" test("Should fail without a tuning file") { when { diff --git a/workflows/tests/transcript_indentification/grohmm/skip_tuning/main.nf.test b/workflows/tests/transcript_indentification/grohmm/skip_tuning/main.nf.test index f8cb4aa2..5956948f 100644 --- a/workflows/tests/transcript_indentification/grohmm/skip_tuning/main.nf.test +++ b/workflows/tests/transcript_indentification/grohmm/skip_tuning/main.nf.test @@ -1,7 +1,7 @@ nextflow_pipeline { name "groHMM" - script "../../../main.nf" + script "../../../../../main.nf" tag "groHMM" test("Should be able to skip tuning") { @@ -17,7 +17,7 @@ nextflow_pipeline { then { assertAll( // TODO Support a tuning file - { assert workflow.failure }, + { assert workflow.failed }, ) } } diff --git a/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test b/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test index 09053601..ea23dc88 100644 --- 
a/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test +++ b/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test @@ -1,6 +1,6 @@ nextflow_pipeline { name "groHMM" - script "../../../main.nf" + script "../../../../../main.nf" tag "groHMM" test("Should run with defaults") { diff --git a/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test.snap b/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test.snap new file mode 100644 index 00000000..e1c67f43 --- /dev/null +++ b/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test.snap @@ -0,0 +1,12 @@ +{ + "software_versions": { + "content": [ + "{BBMAP_PILEUP={bbmap=39.01, samtools=1.16.1, pigz=2.6}, BEDTOOLS_GENOMECOV_MINUS={bedtools=2.31.1}, BEDTOOLS_GENOMECOV_PLUS={bedtools=2.31.1}, BWA_INDEX={bwa=0.7.18-r1243-dirty}, BWA_MEM={bwa=0.7.18-r1243-dirty, samtools=1.2}, CUSTOM_GETCHROMSIZES={getchromsizes=1.2}, DEEPTOOLS_BAMCOVERAGE_MINUS={deeptools=3.5.1}, DEEPTOOLS_BAMCOVERAGE_PLUS={deeptools=3.5.1}, FASTP={fastp=0.23.4}, FASTQC={fastqc=0.12.1}, GROHMM_PARAMETERTUNING={r-base=4.3.3, bioconductor-grohmm=1.39.0}, GTF2BED={perl=5.26.2}, HOMER_MAKETAGDIRECTORY={homer=4.11, samtools=1.11}, PINTS_CALLER={python=3.10.6, pints=1.1.8}, PRESEQ_CCURVE={preseq=3.1.1}, PRESEQ_LCEXTRAP={preseq=3.1.1}, RSEQC_INFEREXPERIMENT={rseqc=5.0.2}, RSEQC_READDISTRIBUTION={rseqc=5.0.2}, RSEQC_READDUPLICATION={rseqc=5.0.2}, SUBREAD_FEATURECOUNTS_GENE={subread=2.0.1}, Workflow={nf-core/nascent=v2.3.0dev}}" + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-26T03:52:06.649510139" + } +} \ No newline at end of file From baa008bae6458cc4649f3b4ffc5b7a41b8e18cec Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Thu, 26 Sep 2024 14:33:42 -0500 Subject: [PATCH 31/54] refactor(grohmm): Use each input Didn't know that was a thing! 
--- modules/local/grohmm/parametertuning/main.nf | 4 +++- subworkflows/local/grohmm/main.nf | 8 +++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/local/grohmm/parametertuning/main.nf b/modules/local/grohmm/parametertuning/main.nf index c682f912..56cb707e 100644 --- a/modules/local/grohmm/parametertuning/main.nf +++ b/modules/local/grohmm/parametertuning/main.nf @@ -9,8 +9,10 @@ process GROHMM_PARAMETERTUNING { 'community.wave.seqera.io/library/grohmm:833aa94cad4202ac' }" input: - tuple val(meta), path(bams), path(bais), val(UTS), val(LtProbB) + tuple val(meta), path(bams), path(bais) path gtf + each UTS + each LtProbB output: tuple val(meta), path("*.tuning.csv"), emit: tuning diff --git a/subworkflows/local/grohmm/main.nf b/subworkflows/local/grohmm/main.nf index bec93af9..260e737d 100644 --- a/subworkflows/local/grohmm/main.nf +++ b/subworkflows/local/grohmm/main.nf @@ -25,14 +25,12 @@ workflow GROHMM { ch_uts = channel.fromList((params.grohmm_min_uts..params.grohmm_max_uts).step(5)) // -100..-400 by 50 for LtProbB ch_ltprobb = channel.fromList((params.grohmm_min_ltprobb..params.grohmm_max_ltprobb).step(50)) - ch_bams_bais_uts_ltprobb = - bams_bais - .combine(ch_uts) - .combine(ch_ltprobb) GROHMM_PARAMETERTUNING ( - ch_bams_bais_uts_ltprobb, + bams_bais, gtf, + ch_uts, + ch_ltprobb, ) .tuning .collectFile( From d9e94aa986cc37bc2f3c1d56052973a96a9312fa Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Fri, 27 Sep 2024 08:50:34 -0500 Subject: [PATCH 32/54] feat(grohmm): Add Native MultiQC support --- bin/transcriptcalling_grohmm.R | 8 +++++--- modules/local/grohmm/parametertuning/environment.yml | 2 +- modules/local/grohmm/transcriptcalling/environment.yml | 2 +- modules/local/grohmm/transcriptcalling/main.nf | 1 + 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/bin/transcriptcalling_grohmm.R b/bin/transcriptcalling_grohmm.R index 5600ca54..e19cd244 100755 --- a/bin/transcriptcalling_grohmm.R +++ 
b/bin/transcriptcalling_grohmm.R @@ -181,19 +181,21 @@ b_plus <- breakTranscriptsOnGenes(tx_hmm, kg_consensus, strand = "+") b_minus <- breakTranscriptsOnGenes(tx_hmm, kg_consensus, strand = "-") tx_broken <- c(b_plus, b_minus) tx_final <- combineTranscripts(tx_broken, kg_consensus) -td_final <- getTxDensity(tx_final, con_expressed, mc.cores = args$cores) export( tx_final, con = paste(args$outprefix, ".final.transcripts.bed", sep = "") ) -capture.output(td_final, file = paste0(args$outprefix, ".tdFinal.txt")) # 1. Output plot jpeg(file = paste0(args$outprefix, ".tdplot_mqc.jpg")) -# 2. Create the plot +# 2. Create the plot and capture data td_final <- getTxDensity(tx_final, con_expressed, mc.cores = args$cores) # 3. Close the file dev.off() +capture.output(td_final, file = paste0(args$outprefix, ".tdFinal.txt")) +# Write the data used in the plot to a CSV file +data_to_write <- data.frame(x = td_final$x, profile = td_final$profile) +write.csv(data_to_write, file = paste0(args$outprefix, ".tdFinal_mqc.csv"), row.names = FALSE) ######################## ## CITE PACKAGES USED ## diff --git a/modules/local/grohmm/parametertuning/environment.yml b/modules/local/grohmm/parametertuning/environment.yml index 5972f503..9541fd45 100644 --- a/modules/local/grohmm/parametertuning/environment.yml +++ b/modules/local/grohmm/parametertuning/environment.yml @@ -11,4 +11,4 @@ dependencies: # NOTE Can't use the official package because of # https://github.com/dankoc/groHMM/issues/2 # - bioconda::bioconductor-grohmm=1.28.0 - - bioinfoorphanage-grohmm=1.36.1 + - bioinfoorphanage-grohmm=1.37.0 diff --git a/modules/local/grohmm/transcriptcalling/environment.yml b/modules/local/grohmm/transcriptcalling/environment.yml index 98febd37..7c33e90d 100644 --- a/modules/local/grohmm/transcriptcalling/environment.yml +++ b/modules/local/grohmm/transcriptcalling/environment.yml @@ -11,4 +11,4 @@ dependencies: # NOTE Can't use the official package because of # 
https://github.com/dankoc/groHMM/issues/2 # - bioconda::bioconductor-grohmm=1.28.0 - - bioinfoorphanage-grohmm=1.36.1 + - bioinfoorphanage-grohmm=1.37.0 diff --git a/modules/local/grohmm/transcriptcalling/main.nf b/modules/local/grohmm/transcriptcalling/main.nf index 83105e4b..2b920842 100644 --- a/modules/local/grohmm/transcriptcalling/main.nf +++ b/modules/local/grohmm/transcriptcalling/main.nf @@ -18,6 +18,7 @@ process GROHMM_TRANSCRIPTCALLING { tuple val(meta), path("*.transcripts.bed"), emit: transcripts_bed tuple val(meta), path("*.tdFinal.txt") , emit: td tuple val(meta), path("*.tdplot_mqc.jpg") , emit: td_plot + tuple val(meta), path("*.tdFinal_mqc.csv") , emit: mqc_csv path "versions.yml" , emit: versions when: From a535c8c78ce9781f3ef62ac67e5a9ad6b2f1980f Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Sat, 28 Sep 2024 11:33:42 -0500 Subject: [PATCH 33/54] fix(grohmm): Update labels for parametertuning Since these are split into separate jobs now, they're not "long" (longest I've seen one take is ~143 minutes). Adding arrays later because there's so many of them and they're the same --- modules/local/grohmm/parametertuning/main.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/local/grohmm/parametertuning/main.nf b/modules/local/grohmm/parametertuning/main.nf index 56cb707e..0f11a54e 100644 --- a/modules/local/grohmm/parametertuning/main.nf +++ b/modules/local/grohmm/parametertuning/main.nf @@ -1,7 +1,8 @@ process GROHMM_PARAMETERTUNING { tag "$meta.id|$UTS|$LtProbB" label 'process_high' - label 'process_long' + label 'error_retry' + // array 10 conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
From 2f1c81c401c89a09860ee6bd1144e91689f0fa86 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Mon, 30 Sep 2024 11:15:11 -0500 Subject: [PATCH 34/54] chore: Add a note about the subworkflow functionality --- subworkflows/local/grohmm/tests/main.nf.test | 2 ++ 1 file changed, 2 insertions(+) diff --git a/subworkflows/local/grohmm/tests/main.nf.test b/subworkflows/local/grohmm/tests/main.nf.test index 905fbf5d..29382e3f 100644 --- a/subworkflows/local/grohmm/tests/main.nf.test +++ b/subworkflows/local/grohmm/tests/main.nf.test @@ -3,6 +3,7 @@ nextflow_workflow { name "Test subworkflow GROHMM" script "../main.nf" workflow "GROHMM" + tag "grohmm" test("Should run without a tuning file and multiplex") { when { @@ -32,6 +33,7 @@ nextflow_workflow { then { assertAll( { assert workflow.success }, + // FIXME this snapshot reports nothing? Probably the test { assert snapshot( workflow.trace.tasks().size(), workflow.out.transcripts, From cd9151e4241518f1ce944927cabc067473744be2 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Sun, 6 Oct 2024 18:48:09 -0500 Subject: [PATCH 35/54] chore: oras => https --- modules/local/grohmm/parametertuning/main.nf | 2 +- modules/local/grohmm/transcriptcalling/main.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/grohmm/parametertuning/main.nf b/modules/local/grohmm/parametertuning/main.nf index 0f11a54e..ffa13d70 100644 --- a/modules/local/grohmm/parametertuning/main.nf +++ b/modules/local/grohmm/parametertuning/main.nf @@ -6,7 +6,7 @@ process GROHMM_PARAMETERTUNING { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'oras://community.wave.seqera.io/library/grohmm:03357458e0821bcb' : + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/b9/b929af5662486ba6ce2d27eb501e5c7ec71ca7dd8e333fe5d3dcf2803d87cf67/data' : 'community.wave.seqera.io/library/grohmm:833aa94cad4202ac' }" input: diff --git a/modules/local/grohmm/transcriptcalling/main.nf b/modules/local/grohmm/transcriptcalling/main.nf index 2b920842..dfb1a731 100644 --- a/modules/local/grohmm/transcriptcalling/main.nf +++ b/modules/local/grohmm/transcriptcalling/main.nf @@ -5,7 +5,7 @@ process GROHMM_TRANSCRIPTCALLING { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'oras://community.wave.seqera.io/library/grohmm:03357458e0821bcb' : + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/b9/b929af5662486ba6ce2d27eb501e5c7ec71ca7dd8e333fe5d3dcf2803d87cf67/data' : 'community.wave.seqera.io/library/grohmm:833aa94cad4202ac' }" input: From 26626fa07ba52e9719d547c318abbbc14e80c0f7 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Thu, 26 Sep 2024 14:33:42 -0500 Subject: [PATCH 36/54] refactor(grohmm): Use each input Didn't know that was a thing! 
--- modules/local/grohmm/parametertuning/tests/main.nf.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/grohmm/parametertuning/tests/main.nf.test b/modules/local/grohmm/parametertuning/tests/main.nf.test index 6e07ec48..eafdb9d9 100644 --- a/modules/local/grohmm/parametertuning/tests/main.nf.test +++ b/modules/local/grohmm/parametertuning/tests/main.nf.test @@ -17,13 +17,13 @@ nextflow_process { [ file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S0mR1.bam", checkIfExists: true), file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S40mR1.bam", checkIfExists: true) ], [], - 5, - -100, ] input[1] = file( "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf", checkIfExists: true ) + input[2] = 5 + input[3] = -100 """ } } From 9272060f1cf50e971768713ebb3ab0ac4e6deda1 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Mon, 7 Oct 2024 16:06:19 -0500 Subject: [PATCH 37/54] test(grohmm): Write a failing test With the help of cursor --- .../grohmm/parametertuning/tests/broken.gtf | 13 +++++++ .../parametertuning/tests/chm13_gtf.nf.test | 37 +++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 modules/local/grohmm/parametertuning/tests/broken.gtf create mode 100644 modules/local/grohmm/parametertuning/tests/chm13_gtf.nf.test diff --git a/modules/local/grohmm/parametertuning/tests/broken.gtf b/modules/local/grohmm/parametertuning/tests/broken.gtf new file mode 100644 index 00000000..a62cff54 --- /dev/null +++ b/modules/local/grohmm/parametertuning/tests/broken.gtf @@ -0,0 +1,13 @@ +chr1 source gene 1000 5000 . + . gene_id "gene1"; +chr1 source transcript 1000 5000 . + . gene_id "gene1"; transcript_id "transcript1"; +chr1 source exon 1000 2000 . + . gene_id "gene1"; transcript_id "transcript1"; exon_number "1"; +chr1 source exon 1500 2500 . + . 
gene_id "gene1"; transcript_id "transcript1"; exon_number "2"; +chr1 source CDS 1000 1500 . + 0 gene_id "gene1"; transcript_id "transcript1"; exon_number "1"; +chr1 source CDS 1500 2000 . + 0 gene_id "gene1"; transcript_id "transcript1"; exon_number "2"; + +chr1 source gene 6000 9000 . + . gene_id "gene2"; +chr1 source transcript 6000 9000 . + . gene_id "gene2"; transcript_id "transcript2"; +chr1 source exon 6000 7000 . + . gene_id "gene2"; transcript_id "transcript2"; exon_number "1"; +chr1 source exon 6500 8000 . + . gene_id "gene2"; transcript_id "transcript2"; exon_number "2"; +chr1 source CDS 6000 6500 . + 0 gene_id "gene2"; transcript_id "transcript2"; exon_number "1"; +chr1 source CDS 6500 7000 . + 0 gene_id "gene2"; transcript_id "transcript2"; exon_number "2"; diff --git a/modules/local/grohmm/parametertuning/tests/chm13_gtf.nf.test b/modules/local/grohmm/parametertuning/tests/chm13_gtf.nf.test new file mode 100644 index 00000000..e0d37e82 --- /dev/null +++ b/modules/local/grohmm/parametertuning/tests/chm13_gtf.nf.test @@ -0,0 +1,37 @@ +nextflow_process { + + name "Test Process GROHMM_PARAMETERTUNING" + script "../main.nf" + process "GROHMM_PARAMETERTUNING" + tag "groHMM" + + test("Should fail with overlapping exons in GTF") { + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id: 'Sall' ], + [ file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S0mR1.bam", checkIfExists: true), + file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S40mR1.bam", checkIfExists: true) ], + [], + ] + input[1] = file( + "https://gist.githubusercontent.com/edmundmiller/f9a31e300a90956d8aaff7ad6105e394/raw/99f6eff1ddb8ca9ac1cd766ea2fed9bb83919fb2/broke.gtf", + checkIfExists: true + ) + input[2] = 5 + input[3] = -100 + """ + } + } + + then { + assertAll( + { assert process.failed }, + ) + } + } +} From 3d5cf853c9a206ff5248df74e6078bdc46aa814e Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Fri, 
4 Oct 2024 11:20:09 -0500 Subject: [PATCH 38/54] fix(grohmm): Update to work with CHM13 refactor: Clean up code more --- bin/parameter_tuning.R | 59 ++++++++++++++++++- main.nf | 2 +- .../parametertuning/tests/chm13_gtf.nf.test | 8 ++- .../tests/chm13_gtf.nf.test.snap | 16 +++++ .../parametertuning/tests/main.nf.test.snap | 6 +- .../grohmm/tuning/main.nf.test | 1 + 6 files changed, 83 insertions(+), 9 deletions(-) create mode 100644 modules/local/grohmm/parametertuning/tests/chm13_gtf.nf.test.snap diff --git a/bin/parameter_tuning.R b/bin/parameter_tuning.R index 42d613e6..73c7e5f2 100755 --- a/bin/parameter_tuning.R +++ b/bin/parameter_tuning.R @@ -103,11 +103,64 @@ for (bam in args$bam_files) { } print("Input transcript annotations") -kg_db <- makeTxDbFromGFF(args$gtf) -kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name")) +# Import the GTF file using rtracklayer +gtf <- import(args$gtf) + +# Exclude any transcripts located on chromosomes labeled with "random" +gtf <- gtf[!grepl("random", seqnames(gtf)), ] + +# Extract transcript-level features +transcripts_gtf <- gtf[gtf$type == "transcript", ] +# Extract exon features +exons_gtf <- gtf[gtf$type == "exon", ] + +# Ensure that the 'transcript_id' and 'gene_id' columns are present +if (!all(c("transcript_id", "gene_id") %in% names(mcols(exons_gtf)))) { + stop("The GTF file lacks 'transcript_id' or 'gene_id' in its attributes.") +} + +# Group exons by transcript_id +exons_by_transcript <- split(exons_gtf, exons_gtf$transcript_id) + +# Diagnostic prints +print(paste("Number of transcripts:", length(exons_by_transcript))) + +# Reduce exons to create transcript ranges +transcripts_ranges <- GenomicRanges::reduce(exons_by_transcript) +transcripts_ranges <- unlist(transcripts_ranges, use.names = TRUE) + +# Diagnostic prints after reduction +print(paste("Number of transcripts_ranges after reduction:", length(transcripts_ranges))) + +# Create mapping dataframe +mapping_df <- data.frame( + transcript_id = 
names(transcripts_ranges), + gene_id = vapply(exons_by_transcript[names(transcripts_ranges)], function(x) unique(x$gene_id)[1], character(1)), + stringsAsFactors = FALSE +) + +# Check for length mismatch +if (nrow(mapping_df) != length(transcripts_ranges)) { + stop(paste("Length mismatch between mapping_df and transcripts_ranges:", nrow(mapping_df), length(transcripts_ranges))) +} + +# Assign metadata +mcols(transcripts_ranges)$transcript_id <- mapping_df$transcript_id +mcols(transcripts_ranges)$gene_id <- mapping_df$gene_id + +# Assign seqnames and strand from the exons +seqnames(transcripts_ranges) <- seqnames(exons_gtf[match(names(transcripts_ranges), exons_gtf$transcript_id)]) +strand(transcripts_ranges) <- strand(exons_gtf[match(names(transcripts_ranges), exons_gtf$transcript_id)]) + +# Ensure that seqlevels are set correctly +seqlevels(transcripts_ranges) <- seqlevels(gtf) + +# Remove any transcripts with NA values +transcripts_ranges <- transcripts_ranges[!is.na(start(transcripts_ranges)) & !is.na(end(transcripts_ranges))] + print("Collapse annotations in preparation for overlap") kg_consensus <- makeConsensusAnnotations( - kg_tx, + transcripts_ranges, mc.cores = args$cores ) print("Finished consensus annotations") diff --git a/main.nf b/main.nf index 53cc81c9..a7c8547b 100644 --- a/main.nf +++ b/main.nf @@ -15,7 +15,7 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { NASCENT } from './workflows/nascent' +include { NASCENT } from './workflows/nascent' include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_nascent_pipeline' include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_nascent_pipeline' include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_nascent_pipeline' diff --git a/modules/local/grohmm/parametertuning/tests/chm13_gtf.nf.test b/modules/local/grohmm/parametertuning/tests/chm13_gtf.nf.test index e0d37e82..0d27d847 100644 --- 
a/modules/local/grohmm/parametertuning/tests/chm13_gtf.nf.test +++ b/modules/local/grohmm/parametertuning/tests/chm13_gtf.nf.test @@ -5,7 +5,7 @@ nextflow_process { process "GROHMM_PARAMETERTUNING" tag "groHMM" - test("Should fail with overlapping exons in GTF") { + test("Should pass with overlapping exons in GTF") { when { params { outdir = "$outputDir" @@ -30,7 +30,11 @@ nextflow_process { then { assertAll( - { assert process.failed }, + { assert process.success }, + { assert snapshot( + path(process.out.tuning[0][1]).readLines(), + path(process.out.bed[0][1]), + ).match() }, ) } } diff --git a/modules/local/grohmm/parametertuning/tests/chm13_gtf.nf.test.snap b/modules/local/grohmm/parametertuning/tests/chm13_gtf.nf.test.snap new file mode 100644 index 00000000..a7e892f2 --- /dev/null +++ b/modules/local/grohmm/parametertuning/tests/chm13_gtf.nf.test.snap @@ -0,0 +1,16 @@ +{ + "Should pass with overlapping exons in GTF": { + "content": [ + [ + "\"LtProbB\",\"UTS\",\"merged\",\"dissociated\",\"total\",\"errorRate\",\"txSize\"", + "-100,5,0,0,0,0,921" + ], + "Sall_5_-100.tuning.consensus.bed:md5,90bcbcd2a2fbd8e2c3602152d89d1cd5" + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-17T20:16:00.717011" + } +} \ No newline at end of file diff --git a/modules/local/grohmm/parametertuning/tests/main.nf.test.snap b/modules/local/grohmm/parametertuning/tests/main.nf.test.snap index dce05f0a..f68dce3f 100644 --- a/modules/local/grohmm/parametertuning/tests/main.nf.test.snap +++ b/modules/local/grohmm/parametertuning/tests/main.nf.test.snap @@ -18,14 +18,14 @@ "content": [ [ "\"LtProbB\",\"UTS\",\"merged\",\"dissociated\",\"total\",\"errorRate\",\"txSize\"", - "-100,5,33,123,156,0.0756180319922443,921" + "-100,5,10,0,10,0.00617665225447807,921" ], - "Sall_5_-100.tuning.consensus.bed:md5,f3144b4ff6493a7ccef422aa4e160d46" + "Sall_5_-100.tuning.consensus.bed:md5,013b67a2aef7956b6400a61d413f65a8" ], "meta": { "nf-test": "0.9.0", 
"nextflow": "24.04.4" }, - "timestamp": "2024-10-17T19:58:42.34659" + "timestamp": "2024-10-17T20:21:05.448531" } } \ No newline at end of file diff --git a/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test b/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test index ea23dc88..921079cc 100644 --- a/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test +++ b/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test @@ -2,6 +2,7 @@ nextflow_pipeline { name "groHMM" script "../../../../../main.nf" tag "groHMM" + // triggers 'bin/parameter_tuning.R', 'bin/transcriptcalling_grohmm.R' test("Should run with defaults") { when { From b8548a3def136ce6fbb6e8d9a561285535c5553b Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Fri, 4 Oct 2024 18:37:20 -0500 Subject: [PATCH 39/54] refactor: Replace jpg with png --- assets/multiqc_config.yml | 2 +- bin/transcriptcalling_grohmm.R | 7 ++++++- docs/output.md | 2 +- modules/local/grohmm/transcriptcalling/main.nf | 2 +- .../transcript_indentification/grohmm/tuning/main.nf.test | 4 ++-- 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index c2263ca3..60f73877 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -50,7 +50,7 @@ custom_data: plot_type: "image" sp: grohmm_plot: - fn: "*.tdplot_mqc.jpg" + fn: "*.tdplot_mqc.png" ignore_images: false export_plots: true diff --git a/bin/transcriptcalling_grohmm.R b/bin/transcriptcalling_grohmm.R index e19cd244..883daa18 100755 --- a/bin/transcriptcalling_grohmm.R +++ b/bin/transcriptcalling_grohmm.R @@ -186,7 +186,12 @@ export( con = paste(args$outprefix, ".final.transcripts.bed", sep = "") ) # 1. Output plot -jpeg(file = paste0(args$outprefix, ".tdplot_mqc.jpg")) +png( + file = paste0(args$outprefix, ".tdplot_mqc.png"), + width = 800, + height = 600, res = 300 +) + # 2. 
Create the plot and capture data td_final <- getTxDensity(tx_final, con_expressed, mc.cores = args$cores) # 3. Close the file diff --git a/docs/output.md b/docs/output.md index 4b251dbe..22715b8a 100644 --- a/docs/output.md +++ b/docs/output.md @@ -326,7 +326,7 @@ For more information about how PINTS works, see the paper [A comparison of exper - `*.eval.txt`: Evaluation of HMM Annotations - `*.final.transcripts.bed`: Predicted transcripts - `*.tdFinal.txt`: Final quality metrics - - `*.tdplot_mqc.jpg`: TD plot included in MultiQC + - `*.tdplot_mqc.png`: TD plot included in MultiQC - `*.transcripts.txt`: Predicted transcripts in txt form - `*.tuning.csv`: The tuning csv that was used diff --git a/modules/local/grohmm/transcriptcalling/main.nf b/modules/local/grohmm/transcriptcalling/main.nf index dfb1a731..7935daae 100644 --- a/modules/local/grohmm/transcriptcalling/main.nf +++ b/modules/local/grohmm/transcriptcalling/main.nf @@ -17,7 +17,7 @@ process GROHMM_TRANSCRIPTCALLING { tuple val(meta), path("*.eval.txt") , emit: eval tuple val(meta), path("*.transcripts.bed"), emit: transcripts_bed tuple val(meta), path("*.tdFinal.txt") , emit: td - tuple val(meta), path("*.tdplot_mqc.jpg") , emit: td_plot + tuple val(meta), path("*.tdplot_mqc.png") , emit: td_plot tuple val(meta), path("*.tdFinal_mqc.csv") , emit: mqc_csv path "versions.yml" , emit: versions diff --git a/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test b/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test index 921079cc..90c84a15 100644 --- a/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test +++ b/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test @@ -30,12 +30,12 @@ nextflow_pipeline { path("$outputDir/transcript_identification/grohmm/cd4.eval.txt"), path("$outputDir/transcript_identification/grohmm/cd4.final.transcripts.bed"), path("$outputDir/transcript_identification/grohmm/cd4.tdFinal.txt"), - 
path("$outputDir/transcript_identification/grohmm/cd4.tdplot_mqc.jpg").exists(), + path("$outputDir/transcript_identification/grohmm/cd4.tdplot_mqc.png").exists(), path("$outputDir/transcript_identification/grohmm/cd4.transcripts.txt"), path("$outputDir/transcript_identification/grohmm/jurkat.eval.txt"), path("$outputDir/transcript_identification/grohmm/jurkat.final.transcripts.bed"), path("$outputDir/transcript_identification/grohmm/jurkat.tdFinal.txt"), - path("$outputDir/transcript_identification/grohmm/jurkat.tdplot_mqc.jpg").exists(), + path("$outputDir/transcript_identification/grohmm/jurkat.tdplot_mqc.png").exists(), path("$outputDir/transcript_identification/grohmm/jurkat.transcripts.txt"), // FIXME Not determinstic because of the order of files // Add to the other tests when fixed From 0c7a9ba0d83045e23571f84ca9e1478f12ea7cf6 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Wed, 9 Oct 2024 16:30:30 -0500 Subject: [PATCH 40/54] test: Remove skip_tuning test --- .../grohmm/skip_tuning/main.nf.test | 24 ------------------- 1 file changed, 24 deletions(-) delete mode 100644 workflows/tests/transcript_indentification/grohmm/skip_tuning/main.nf.test diff --git a/workflows/tests/transcript_indentification/grohmm/skip_tuning/main.nf.test b/workflows/tests/transcript_indentification/grohmm/skip_tuning/main.nf.test deleted file mode 100644 index 5956948f..00000000 --- a/workflows/tests/transcript_indentification/grohmm/skip_tuning/main.nf.test +++ /dev/null @@ -1,24 +0,0 @@ -nextflow_pipeline { - - name "groHMM" - script "../../../../../main.nf" - tag "groHMM" - - test("Should be able to skip tuning") { - - when { - params { - outdir = "$outputDir" - skip_grohmm = false - skip_tuning = true - } - } - - then { - assertAll( - // TODO Support a tuning file - { assert workflow.failed }, - ) - } - } -} From 09587789746789967fc48e40d8875937564ab00d Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Wed, 9 Oct 2024 17:42:02 -0500 Subject: [PATCH 41/54] test: Bump 
snapshot --- .../grohmm/tuning/main.nf.test.snap | 122 +++++++++++++++++- 1 file changed, 120 insertions(+), 2 deletions(-) diff --git a/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test.snap b/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test.snap index e1c67f43..5796458e 100644 --- a/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test.snap +++ b/workflows/tests/transcript_indentification/grohmm/tuning/main.nf.test.snap @@ -1,12 +1,130 @@ { + "output_files": { + "content": [ + 215, + "cd4.bed:md5,b55e5290d78941f36c3d1ecfef8e0062", + "jurkat.bed:md5,383cfaf10535dbe5d7f47607e345f4cb", + [ + "cd4_intersect.bed:md5,08e9166b4515fa76c7f624a5377d630f", + "jurkat_intersect.bed:md5,3f625c7e363f49f75bcac2d9316af2e0" + ], + [ + "cd4_filtered.bed:md5,bafed399a3f4d5d8363f40616bca4824", + "jurkat_filtered.bed:md5,ee570f28e2f347b66f085357e8ddba57" + ], + [ + "cd4_intersect.bed:md5,08e9166b4515fa76c7f624a5377d630f", + "jurkat_intersect.bed:md5,3f625c7e363f49f75bcac2d9316af2e0" + ], + [ + "cd4_filtered.bed:md5,bafed399a3f4d5d8363f40616bca4824", + "jurkat_filtered.bed:md5,ee570f28e2f347b66f085357e8ddba57" + ], + "cd4.eval.txt:md5,7f49f3b1211f9a2086a3df3b5932b14e", + "cd4.final.transcripts.bed:md5,51089857e6c7c0fa2259c9615e201564", + "cd4.tdFinal.txt:md5,4a40e320646af024b151d199fd442380", + true, + "cd4.transcripts.txt:md5,256b4b93c61506c8470d6c20e7043a85", + "jurkat.eval.txt:md5,7168cf089146b0752d6eb87836e91e34", + "jurkat.final.transcripts.bed:md5,2d560a6af857d69042e1fba09d8533d1", + "jurkat.tdFinal.txt:md5,f65dcec240c6c55f0fb077ed21f17285", + true, + "jurkat.transcripts.txt:md5,a601a4312e953e1adc05e18b7488c3b0", + true + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-09T17:31:48.135947809" + }, "software_versions": { "content": [ - "{BBMAP_PILEUP={bbmap=39.01, samtools=1.16.1, pigz=2.6}, BEDTOOLS_GENOMECOV_MINUS={bedtools=2.31.1}, BEDTOOLS_GENOMECOV_PLUS={bedtools=2.31.1}, 
BWA_INDEX={bwa=0.7.18-r1243-dirty}, BWA_MEM={bwa=0.7.18-r1243-dirty, samtools=1.2}, CUSTOM_GETCHROMSIZES={getchromsizes=1.2}, DEEPTOOLS_BAMCOVERAGE_MINUS={deeptools=3.5.1}, DEEPTOOLS_BAMCOVERAGE_PLUS={deeptools=3.5.1}, FASTP={fastp=0.23.4}, FASTQC={fastqc=0.12.1}, GROHMM_PARAMETERTUNING={r-base=4.3.3, bioconductor-grohmm=1.39.0}, GTF2BED={perl=5.26.2}, HOMER_MAKETAGDIRECTORY={homer=4.11, samtools=1.11}, PINTS_CALLER={python=3.10.6, pints=1.1.8}, PRESEQ_CCURVE={preseq=3.1.1}, PRESEQ_LCEXTRAP={preseq=3.1.1}, RSEQC_INFEREXPERIMENT={rseqc=5.0.2}, RSEQC_READDISTRIBUTION={rseqc=5.0.2}, RSEQC_READDUPLICATION={rseqc=5.0.2}, SUBREAD_FEATURECOUNTS_GENE={subread=2.0.1}, Workflow={nf-core/nascent=v2.3.0dev}}" + { + "BBMAP_PILEUP": { + "bbmap": 39.01, + "samtools": "1.16.1", + "pigz": 2.6 + }, + "BEDTOOLS_GENOMECOV_MINUS": { + "bedtools": "2.31.1" + }, + "BEDTOOLS_GENOMECOV_PLUS": { + "bedtools": "2.31.1" + }, + "BEDTOOLS_INTERSECT": { + "bedtools": "2.31.1" + }, + "BEDTOOLS_INTERSECT_FILTER": { + "bedtools": "2.31.1" + }, + "BWA_INDEX": { + "bwa": "0.7.18-r1243-dirty" + }, + "BWA_MEM": { + "bwa": "0.7.18-r1243-dirty", + "samtools": 1.2 + }, + "CUSTOM_GETCHROMSIZES": { + "getchromsizes": 1.2 + }, + "DEEPTOOLS_BAMCOVERAGE_MINUS": { + "deeptools": "3.5.1" + }, + "DEEPTOOLS_BAMCOVERAGE_PLUS": { + "deeptools": "3.5.1" + }, + "FASTP": { + "fastp": "0.23.4" + }, + "FASTQC": { + "fastqc": "0.12.1" + }, + "GROHMM_PARAMETERTUNING": { + "r-base": "4.3.3", + "bioconductor-grohmm": "1.39.0" + }, + "GTF2BED": { + "perl": "5.26.2" + }, + "HOMER_MAKETAGDIRECTORY": { + "homer": 4.11, + "samtools": 1.11 + }, + "PINTS_CALLER": { + "python": "3.10.6", + "pints": "1.1.8" + }, + "PRESEQ_CCURVE": { + "preseq": "3.1.1" + }, + "PRESEQ_LCEXTRAP": { + "preseq": "3.1.1" + }, + "RSEQC_INFEREXPERIMENT": { + "rseqc": "5.0.2" + }, + "RSEQC_READDISTRIBUTION": { + "rseqc": "5.0.2" + }, + "RSEQC_READDUPLICATION": { + "rseqc": "5.0.2" + }, + "SUBREAD_FEATURECOUNTS_GENE": { + "subread": "2.0.1" + }, + 
"SUBREAD_FEATURECOUNTS_PREDICTED": { + "subread": "2.0.1" + }, + "Workflow": { + "nf-core/nascent": "v2.3.0dev" + } + } ], "meta": { "nf-test": "0.9.0", "nextflow": "24.04.4" }, - "timestamp": "2024-09-26T03:52:06.649510139" + "timestamp": "2024-10-09T17:31:48.111668358" } } \ No newline at end of file From d17d9df0e880da23bc56f5d7940cd7bfc2bbf0df Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Tue, 15 Oct 2024 08:38:10 -0500 Subject: [PATCH 42/54] Update CHANGELOG --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d09edad9..90ba7f78 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#137](https://github.com/nf-core/nascent/pull/137) - Use singularity containers for PINTS - [#142](https://github.com/nf-core/nascent/pull/142) - Updated CHM13 references - [#171](https://github.com/nf-core/nascent/pull/171) - Use assertAll in tests +- [#165](https://github.com/nf-core/nascent/pull/165) - groHMM overhaul. Removed R mclapply calls and replaced with Nextflow scatter gather for parameter tuning. This creates a job for each parameter set. ### Fixed @@ -29,7 +30,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Removed -- Support for groHMM tuning files +- [[#165](https://github.com/nf-core/nascent/pull/165)] - Removed support for groHMM tuning files. 
## v2.2.0 - 2024-03-05 From 3c947b05c948c2501dae3c2c92c9e61ce9e0af72 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Tue, 15 Oct 2024 14:54:52 -0500 Subject: [PATCH 43/54] docs: Update groHMM docs --- docs/usage.md | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 214ecea8..f1a5b4b9 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -79,7 +79,36 @@ The default transcript identification option is PINTS, and HOMER if the transcri ### GroHMM -When selecting GroHMM as an option, the pipeline by default tests a list of preset hold-out parameters to select for the combination of arguments which would result in the lowest possible error rate during the transcript identification process. The user may also choose to provide their own list of hold-out parameters to test, or skip the tuning process altogether due to time constraints. If the tuning process is skipped ('--skip_tuning') then the user may indicate the specific holdout parameters to use ('--uts' and '--ltprobb') or choose to use the default parameters. +groHMM is split into two steps: parameter tuning and transcript identification. + +When running the pipeline with groHMM as a transcript identification method, the pipeline will automatically perform a parameter tuning process. This process is unique to the groHMM transcript identification method and is designed to select the optimal hold-out parameters for the groHMM algorithm. See [this issue](https://github.com/dankoc/groHMM/issues/4) for more information. + +In the groHMM vignette, the code is ran using a single mclapply call, which is a scatter gather approach. This is not ideal for large datasets, because it ends up being bottlenecked by the memory available on your local machine. To improve this, we have written a Nextflow script that runs the pipeline with a scatter gather approach. 
This is done by running the pipeline with a single hold-out parameter, and then the next parameter, and so on. This is more memory efficient and scales better to larger datasets. The results are then combined then combined in the end as intended and used in the transcript identification process. + +#### groHMM Parameters + +> The detectTranscripts function also uses two hold-out parameters. These parameters, specified by the arguments LtProbB and UTS, represents the log-transformed transition probability of switching from transcribed state to non-transcribed state and variance of the emission probability for reads in the non-transcribed state, respectively. Holdout parameters are used to optimize the performance of HMM predictions on known genes. + +In the pipeline, the parameters are specified as follows: +grohmm_min_uts = 5 +grohmm_max_uts = 45 +grohmm_min_ltprobb = -100 +grohmm_max_ltprobb = -400 + +Which will then create a job for each parameter combination. For example (5,-100), (5,-150), (10,-100), (10,-150)... + +If you have indentified a good set of parameters, you can run the pipeline with those parameters by specifying, all 4 values. 
+ +For example if you have indentified that the best parameters for your data are 15,-200: + +```json +{ + "grohmm_min_uts": 15, + "grohmm_max_uts": 15, + "grohmm_min_ltprobb": -200, + "grohmm_max_ltprobb": -200 +} +``` ## Running the pipeline From 2b2b90f228e0eaaae37242a562f9d2f81cc75686 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Wed, 16 Oct 2024 14:04:34 -0500 Subject: [PATCH 44/54] fix(grohmm): Try calling no more than 10 cores --- bin/parameter_tuning.R | 2 +- modules/local/grohmm/parametertuning/main.nf | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/bin/parameter_tuning.R b/bin/parameter_tuning.R index 73c7e5f2..167f1e04 100755 --- a/bin/parameter_tuning.R +++ b/bin/parameter_tuning.R @@ -161,7 +161,7 @@ transcripts_ranges <- transcripts_ranges[!is.na(start(transcripts_ranges)) & !is print("Collapse annotations in preparation for overlap") kg_consensus <- makeConsensusAnnotations( transcripts_ranges, - mc.cores = args$cores + mc.cores = min(args$cores, 10) # 10 the number they had hardcoded in the grohmm package for some reason ) print("Finished consensus annotations") diff --git a/modules/local/grohmm/parametertuning/main.nf b/modules/local/grohmm/parametertuning/main.nf index ffa13d70..fef7cab3 100644 --- a/modules/local/grohmm/parametertuning/main.nf +++ b/modules/local/grohmm/parametertuning/main.nf @@ -1,7 +1,6 @@ process GROHMM_PARAMETERTUNING { tag "$meta.id|$UTS|$LtProbB" label 'process_high' - label 'error_retry' // array 10 conda "${moduleDir}/environment.yml" From e1015538da6a7510636983a693299bdd9d61364a Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Fri, 18 Oct 2024 10:08:33 -0500 Subject: [PATCH 45/54] chore: Add a custom makeConsensusAnnotations function --- bin/parameter_tuning.R | 151 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 150 insertions(+), 1 deletion(-) diff --git a/bin/parameter_tuning.R b/bin/parameter_tuning.R index 167f1e04..f6e750f1 100755 --- a/bin/parameter_tuning.R +++ 
b/bin/parameter_tuning.R @@ -91,6 +91,155 @@ if (is.null(args$bam_files)) { stop("Please provide a bam file", call. = FALSE) } +#' makeConsensusAnnotations Makes a consensus annotation +#' +#' Makes a non-overlapping consensus annotation. Gene annotations are often +#' overalpping due to #' multiple isoforms for a gene. +#' In consensus annotation, isoforms are first reduced so that only +#' redundant intervals are used to represent a genomic interval for a gene, +#' i.e., a gene id. +#' Remaining unresolved annotations are further reduced by truncating 3' +#' end of annotations. +#' +#' Supports parallel processing using mclapply in the 'parallel' package. +#' To change the number of processors, use the argument 'mc.cores'. +#' +#' @param ar GRanges of annotations to be collapsed. +#' @param minGap Minimun gap between overlapped annotations after truncated. +#' Default: 1L +#' @param minWidth Minimun width of consensus annotations. Default: 1000L +#' @param ... Extra argument passed to mclapply. +#' @return Returns GRanges object of annotations. +#' @author Minho Chae +#' @examples +#' ## Not run: +#' # library(TxDb.Hsapiens.UCSC.hg19.knownGene) +#' # txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene +#' # tx <- transcripts(txdb, columns=c("gene_id", "tx_id", "tx_name"), +#' filter=list(tx_chrom="chr7")) +#' # tx <- tx[grep("random", as.character(seqnames(tx)), invert=TRUE),] +#' # ca <- makeConsensusAnnotations(tx) +custom_makeConsensusAnnotations <- function(ar, minGap=1L, minWidth=1000L, ...) 
{ + # check missing gene_id + missing <- elementNROWS(mcols(ar)[,"gene_id"]) == 0 + if (any(missing)) { + ar <- ar[!missing,] + warning(sum(missing), " ranges do not have gene_id and they are + dropped") + } + + many <- elementNROWS(mcols(ar)[,"gene_id"]) > 1 + if (any(many)) { + ar <- ar[!many,] + warning(sum(many), " ranges have multiple gene_id and they are + dropped") + } + + ar_list <- split(ar, unlist(mcols(ar)[,"gene_id"])) + singles <- unlist(ar_list[elementNROWS(ar_list) == 1]) + isoforms <- ar_list[elementNROWS(ar_list) > 1] + + message("Reduce isoforms(", length(isoforms),") ... ", appendLF=FALSE) + isoforms <- GRangesList(mclapply(isoforms, function(x) { + # For mixed strands or chrom, choose the longest + if ((length(seqlevelsInUse(x)) > 1) || + (length(unique(strand(x))) > 1)) { + result <- x[which.max(width(x)), "gene_id"] + } else { + dx <- disjoin(x) + mcols(dx)$gene_id <- mcols(x)$gene_id[1] + olcnt <- countOverlaps(dx, x) + + multi <- dx[olcnt > 1] # Use the disjoint ranges + # covered more than once + if (length(multi) == 0) { # For non-overlapping isoforms, + # choose the longest + result <- x[which.max(width(x)), "gene_id"] + } else if (length(multi) == 1) { + result <- multi + } else { + reduced <- reduce(multi) + if (length(reduced) == 1) + result <- reduced + else (length(reduced) > 1) + result <- reduced[which.max(width(reduced)),] + + } + mcols(result)$gene_id <- mcols(x)$gene_id[1] + } + return(result) + }, ...)) + isoforms <- unlist(isoforms) + message("OK") + + # Check redundancy + isoforms <- removeRedundant(isoforms) + singles <- removeRedundant(singles) + + o <- findOverlaps(singles, isoforms, type="equal") + if(length(o) != 0) + singles <- singles[-queryHits(o),] + + o <- findOverlaps(singles, isoforms, type="within") + if(length(o) != 0) + singles <- singles[-queryHits(o),] + + o <- findOverlaps(isoforms, singles, type="within") + if(length(o) != 0) + isoforms <- isoforms[-queryHits(o),] + + noiso <- sort(c(isoforms, 
singles[,"gene_id"])) + message("Truncate overlapped ranges ... ", appendLF=FALSE) + # with different gene_ids + while(!isDisjoint(noiso)) { + ol <- findOverlaps(noiso, drop.self=TRUE, drop.redundant=TRUE) + ol_gr <- GRangesList(lapply(1:length(ol), function(x) { + sort(c(noiso[queryHits(ol)[x]], + noiso[subjectHits(ol)[x]])) + })) + + # Truncate 3' end + ol_gr <- unlist(endoapply(ol_gr, function(x) { + if (as.character(strand(x[1,])) == "+") { + end(x[1,]) <- start(x[2,]) - minGap + # first range's end is truncated + } else { + start(x[2,]) <- end(x[1,]) + minGap + # sencond range's end is truncated + } + x + })) + + # Remove any ranges with duplicated names since they already adujsted + # in the previous call + ol_gr <- ol_gr[!duplicated(names(ol_gr)),] + + noiso <- noiso[-unique(c(queryHits(ol), subjectHits(ol))),] + # update noiso + noiso <- c(noiso, ol_gr) + } + message("OK") + + noiso <- noiso[width(noiso) >= minWidth,] + return(sort(noiso)) +} + +removeRedundant <- function(annox) { + o <- findOverlaps(annox, drop.self=TRUE, type="equal", + drop.redundant=TRUE) + if(length(o) != 0) + annox <- annox[-subjectHits(o),] + + o <- findOverlaps(annox, drop.self=TRUE, type="within", + drop.redundant=TRUE) + if(length(o) != 0) + annox <- annox[-queryHits(o),] + + return(annox) +} + + + # Load alignment files # TODO? 
CHANGE BASED ON PAIRED OR SINGLE END alignments <- c() @@ -159,7 +308,7 @@ seqlevels(transcripts_ranges) <- seqlevels(gtf) transcripts_ranges <- transcripts_ranges[!is.na(start(transcripts_ranges)) & !is.na(end(transcripts_ranges))] print("Collapse annotations in preparation for overlap") -kg_consensus <- makeConsensusAnnotations( +kg_consensus <- custom_makeConsensusAnnotations( transcripts_ranges, mc.cores = min(args$cores, 10) # 10 the number they had hardcoded in the grohmm package for some reason ) From 3028ae195d6060202f1dd510149cd8ba14a9c99d Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Fri, 18 Oct 2024 10:33:11 -0500 Subject: [PATCH 46/54] Try creating a custom makeConsensusAnnotations and setup test --- bin/custom_makeConsensusAnnotations | 174 ++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 bin/custom_makeConsensusAnnotations diff --git a/bin/custom_makeConsensusAnnotations b/bin/custom_makeConsensusAnnotations new file mode 100644 index 00000000..f6cada95 --- /dev/null +++ b/bin/custom_makeConsensusAnnotations @@ -0,0 +1,174 @@ +#' custom_makeConsensusAnnotations Makes a consensus annotation #' +#' Makes a non-overlapping consensus annotation. Gene annotations are often +#' overalpping due to #' multiple isoforms for a gene. +#' In consensus annotation, isoforms are first reduced so that only +#' redundant intervals are used to represent a genomic interval for a gene, +#' i.e., a gene id. +#' Remaining unresolved annotations are further reduced by truncating 3' +#' end of annotations. +#' +#' Supports parallel processing using mclapply in the 'parallel' package. +#' To change the number of processors, use the argument 'mc.cores'. +#' +#' @param ar GRanges of annotations to be collapsed. +#' @param minGap Minimun gap between overlapped annotations after truncated. +#' Default: 1L +#' @param minWidth Minimun width of consensus annotations. Default: 1000L +#' @param ... Extra argument passed to mclapply. 
+#' @return Returns GRanges object of annotations. +#' @author Minho Chae +#' @examples +#' ## Not run: +#' # library(TxDb.Hsapiens.UCSC.hg19.knownGene) +#' # txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene +#' # tx <- transcripts(txdb, columns=c("gene_id", "tx_id", "tx_name"), +#' filter=list(tx_chrom="chr7")) +#' # tx <- tx[grep("random", as.character(seqnames(tx)), invert=TRUE),] +#' # ca <- makeConsensusAnnotations(tx) +custom_makeConsensusAnnotations <- function(ar, minGap=1L, minWidth=1000L, ...) { + # Use subset instead of indexing to avoid copying + ar <- subset(ar, elementNROWS(mcols(ar)[,"gene_id"]) == 1) + if (length(ar) < length(ar)) { + warning(length(ar) - length(ar), " ranges were dropped due to missing or multiple gene_ids") + } + + # Use tapply instead of split to avoid creating intermediate lists + ar_list <- tapply(ar, mcols(ar)$gene_id, identity, simplify = FALSE) + singles <- ar_list[lengths(ar_list) == 1] + isoforms <- ar_list[lengths(ar_list) > 1] + + message("Reduce isoforms(", length(isoforms),") ... ", appendLF=FALSE) + isoforms <- mclapply(isoforms, function(x) { + # For mixed strands or chrom, choose the longest + if ((length(seqlevelsInUse(x)) > 1) || + (length(unique(strand(x))) > 1)) { + result <- x[which.max(width(x)), "gene_id"] + } else { + dx <- disjoin(x) + mcols(dx)$gene_id <- mcols(x)$gene_id[1] + olcnt <- countOverlaps(dx, x) + + multi <- dx[olcnt > 1] # Use the disjoint ranges + # covered more than once + if (length(multi) == 0) { # For non-overlapping isoforms, + # choose the longest + result <- x[which.max(width(x)), "gene_id"] + } else if (length(multi) == 1) { + result <- multi + } else { + reduced <- reduce(multi) + if (length(reduced) == 1) + result <- reduced + else (length(reduced) > 1) + result <- reduced[which.max(width(reduced)),] + + } + mcols(result)$gene_id <- mcols(x)$gene_id[1] + } + return(result) + }, ...) 
+ isoforms <- unlist(GRangesList(isoforms)) + message("OK") + + # Combine removeRedundant operations + removeRedundant <- function(annox) { + o <- findOverlaps(annox, drop.self = TRUE, type = "any", drop.redundant = TRUE) + annox[-unique(c( + queryHits(o)[width(annox)[queryHits(o)] <= width(annox)[subjectHits(o)]], + subjectHits(o)[width(annox)[queryHits(o)] > width(annox)[subjectHits(o)]] + ))] + } + + isoforms <- removeRedundant(isoforms) + singles <- removeRedundant(do.call(c, singles)) + + # Use setdiff instead of multiple findOverlaps + singles <- setdiff(singles, isoforms) + + noiso <- sort(c(isoforms, singles)) + message("Truncate overlapped ranges ... ", appendLF=FALSE) +# Optimize the while loop + while(!isDisjoint(noiso)) { + ol <- findOverlaps(noiso, drop.self=TRUE, drop.redundant=TRUE) + ol_df <- as.data.frame(ol) + ol_df$strand <- as.character(strand(noiso)[ol_df$queryHits]) + + # Vectorized operations instead of lapply + ol_df$new_end <- ifelse(ol_df$strand == "+", + start(noiso)[ol_df$subjectHits] - minGap, + end(noiso)[ol_df$queryHits] + ) + ol_df$new_start <- ifelse(ol_df$strand == "+", + start(noiso)[ol_df$queryHits], + end(noiso)[ol_df$queryHits] + minGap + ) + + # Update ranges in one operation + noiso <- c( + noiso[-unique(c(ol_df$queryHits, ol_df$subjectHits))], + GRanges(seqnames(noiso)[ol_df$queryHits], + IRanges(ol_df$new_start, ol_df$new_end), + strand = ol_df$strand, + gene_id = mcols(noiso)$gene_id[ol_df$queryHits] + ) + ) + } + message("OK") + + subset(noiso, width >= minWidth) +} + +removeRedundant <- function(annox) { + o <- findOverlaps(annox, drop.self=TRUE, type="equal", + drop.redundant=TRUE) + if(length(o) != 0) + annox <- annox[-subjectHits(o),] + + o <- findOverlaps(annox, drop.self=TRUE, type="within", + drop.redundant=TRUE) + if(length(o) != 0) + annox <- annox[-queryHits(o),] + + return(annox) +} + + + +## TESTING +suppressPackageStartupMessages(library(groHMM)) +suppressPackageStartupMessages(library(testthat)) 
+suppressPackageStartupMessages(library(GenomicRanges)) +suppressPackageStartupMessages(library(rtracklayer)) + +test_that("custom_makeConsensusAnnotations maintains accuracy", { + # Load the GTF file + gtf_url <- "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf" + gtf <- import(gtf_url) + + # Extract transcripts from the GTF + transcripts <- gtf[gtf$type == "transcript"] + + # Create a sample GRanges object + set.seed(42) + n <- 100 + sample_ranges <- GRanges( + seqnames = "chr7", + ranges = IRanges( + start = sort(sample(1:max(end(transcripts)), n, replace = TRUE)), + width = sample(50:500, n, replace = TRUE) + ), + strand = sample(c("+", "-"), n, replace = TRUE), + gene_id = sample(transcripts$gene_id, n, replace = TRUE) + ) + print(sample_ranges) + + # Run both original and refactored functions + original_result <- groHMM::makeConsensusAnnotations(sample_ranges) + refactored_result <- custom_makeConsensusAnnotations(sample_ranges) + + # Compare results + expect_equal(length(original_result), length(refactored_result)) + expect_equal(sum(width(original_result)), sum(width(refactored_result))) + expect_equal(sort(unique(mcols(original_result)$gene_id)), + sort(unique(mcols(refactored_result)$gene_id))) +}) From d4ac91f83ec9f5ee1a9d2616e7d43563c3a60927 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Fri, 18 Oct 2024 10:57:11 -0500 Subject: [PATCH 47/54] Get custom running --- bin/custom_makeConsensusAnnotations | 82 ++++++++++++++++++++--------- 1 file changed, 56 insertions(+), 26 deletions(-) diff --git a/bin/custom_makeConsensusAnnotations b/bin/custom_makeConsensusAnnotations index f6cada95..a37d54a1 100644 --- a/bin/custom_makeConsensusAnnotations +++ b/bin/custom_makeConsensusAnnotations @@ -34,7 +34,7 @@ custom_makeConsensusAnnotations <- function(ar, minGap=1L, minWidth=1000L, ...) 
# Use tapply instead of split to avoid creating intermediate lists ar_list <- tapply(ar, mcols(ar)$gene_id, identity, simplify = FALSE) - singles <- ar_list[lengths(ar_list) == 1] + singles <- do.call(c, ar_list[lengths(ar_list) == 1]) isoforms <- ar_list[lengths(ar_list) > 1] message("Reduce isoforms(", length(isoforms),") ... ", appendLF=FALSE) @@ -70,24 +70,37 @@ custom_makeConsensusAnnotations <- function(ar, minGap=1L, minWidth=1000L, ...) isoforms <- unlist(GRangesList(isoforms)) message("OK") + # Ensure singles is a GRanges object + if (!is(singles, "GRanges")) { + singles <- unlist(GRangesList(singles)) + } + # Combine removeRedundant operations removeRedundant <- function(annox) { - o <- findOverlaps(annox, drop.self = TRUE, type = "any", drop.redundant = TRUE) - annox[-unique(c( - queryHits(o)[width(annox)[queryHits(o)] <= width(annox)[subjectHits(o)]], - subjectHits(o)[width(annox)[queryHits(o)] > width(annox)[subjectHits(o)]] - ))] + if (is(annox, "GRanges")) { + o <- findOverlaps(annox, drop.self = TRUE, type = "any", drop.redundant = TRUE) + annox[-unique(c( + queryHits(o)[width(annox)[queryHits(o)] <= width(annox)[subjectHits(o)]], + subjectHits(o)[width(annox)[queryHits(o)] > width(annox)[subjectHits(o)]] + ))] + } else { + warning("Input to removeRedundant is not a GRanges object") + print(class(annox)) # Debug: Print class of annox + print(str(annox)) # Debug: Print structure of annox + return(annox) + } } isoforms <- removeRedundant(isoforms) - singles <- removeRedundant(do.call(c, singles)) + singles <- removeRedundant(singles) # Use setdiff instead of multiple findOverlaps - singles <- setdiff(singles, isoforms) + singles <- GenomicRanges::setdiff(singles, isoforms) noiso <- sort(c(isoforms, singles)) message("Truncate overlapped ranges ... 
", appendLF=FALSE) -# Optimize the while loop + + # Optimize the while loop while(!isDisjoint(noiso)) { ol <- findOverlaps(noiso, drop.self=TRUE, drop.redundant=TRUE) ol_df <- as.data.frame(ol) @@ -95,23 +108,18 @@ custom_makeConsensusAnnotations <- function(ar, minGap=1L, minWidth=1000L, ...) # Vectorized operations instead of lapply ol_df$new_end <- ifelse(ol_df$strand == "+", - start(noiso)[ol_df$subjectHits] - minGap, - end(noiso)[ol_df$queryHits] - ) + start(noiso)[ol_df$subjectHits] - minGap, + end(noiso)[ol_df$queryHits]) ol_df$new_start <- ifelse(ol_df$strand == "+", - start(noiso)[ol_df$queryHits], - end(noiso)[ol_df$queryHits] + minGap - ) + start(noiso)[ol_df$queryHits], + end(noiso)[ol_df$queryHits] + minGap) # Update ranges in one operation - noiso <- c( - noiso[-unique(c(ol_df$queryHits, ol_df$subjectHits))], - GRanges(seqnames(noiso)[ol_df$queryHits], - IRanges(ol_df$new_start, ol_df$new_end), - strand = ol_df$strand, - gene_id = mcols(noiso)$gene_id[ol_df$queryHits] - ) - ) + noiso <- c(noiso[-unique(c(ol_df$queryHits, ol_df$subjectHits))], + GRanges(seqnames(noiso)[ol_df$queryHits], + IRanges(ol_df$new_start, ol_df$new_end), + strand=ol_df$strand, + gene_id=mcols(noiso)$gene_id[ol_df$queryHits])) } message("OK") @@ -139,11 +147,11 @@ suppressPackageStartupMessages(library(groHMM)) suppressPackageStartupMessages(library(testthat)) suppressPackageStartupMessages(library(GenomicRanges)) suppressPackageStartupMessages(library(rtracklayer)) +# Load the GTF file +gtf_url <- "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf" +gtf <- import(gtf_url) test_that("custom_makeConsensusAnnotations maintains accuracy", { - # Load the GTF file - gtf_url <- "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf" - gtf <- import(gtf_url) # Extract transcripts from the GTF transcripts <- 
gtf[gtf$type == "transcript"] @@ -172,3 +180,25 @@ test_that("custom_makeConsensusAnnotations maintains accuracy", { expect_equal(sort(unique(mcols(original_result)$gene_id)), sort(unique(mcols(refactored_result)$gene_id))) }) + +test_that("custom_makeConsensusAnnotations maintains accuracy large", { + # Extract transcripts from the GTF + transcripts <- gtf[gtf$type == "transcript"] + + # Create a sample GRanges object + set.seed(42) + sample_ranges <- transcripts + print(sample_ranges) + + # Run both original and refactored functions + original_result <- groHMM::makeConsensusAnnotations(sample_ranges) + refactored_result <- custom_makeConsensusAnnotations(sample_ranges) + + # Compare results + expect_equal(length(original_result), length(refactored_result)) + expect_equal(sum(width(original_result)), sum(width(refactored_result))) + expect_equal( + sort(unique(mcols(original_result)$gene_id)), + sort(unique(mcols(refactored_result)$gene_id)) + ) +}) From 3444c5bf5d97c1e5fe82b9b1f359fb5fb4bbaa80 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Fri, 18 Oct 2024 15:50:26 -0500 Subject: [PATCH 48/54] Start over --- bin/custom_makeConsensusAnnotations | 181 ++++++++++++++-------------- 1 file changed, 93 insertions(+), 88 deletions(-) diff --git a/bin/custom_makeConsensusAnnotations b/bin/custom_makeConsensusAnnotations index a37d54a1..bd9040c4 100644 --- a/bin/custom_makeConsensusAnnotations +++ b/bin/custom_makeConsensusAnnotations @@ -26,19 +26,27 @@ #' # tx <- tx[grep("random", as.character(seqnames(tx)), invert=TRUE),] #' # ca <- makeConsensusAnnotations(tx) custom_makeConsensusAnnotations <- function(ar, minGap=1L, minWidth=1000L, ...) 
{ - # Use subset instead of indexing to avoid copying - ar <- subset(ar, elementNROWS(mcols(ar)[,"gene_id"]) == 1) - if (length(ar) < length(ar)) { - warning(length(ar) - length(ar), " ranges were dropped due to missing or multiple gene_ids") + # check missing gene_id + missing <- elementNROWS(mcols(ar)[,"gene_id"]) == 0 + if (any(missing)) { + ar <- ar[!missing,] + warning(sum(missing), " ranges do not have gene_id and they are + dropped") } - # Use tapply instead of split to avoid creating intermediate lists - ar_list <- tapply(ar, mcols(ar)$gene_id, identity, simplify = FALSE) - singles <- do.call(c, ar_list[lengths(ar_list) == 1]) - isoforms <- ar_list[lengths(ar_list) > 1] + many <- elementNROWS(mcols(ar)[, "gene_id"]) > 1 + if (any(many)) { + ar <- ar[!many, ] + warning(sum(many), " ranges have multiple gene_id and they are + dropped") + } + + ar_list <- split(ar, unlist(mcols(ar)[, "gene_id"])) + singles <- unlist(ar_list[elementNROWS(ar_list) == 1]) + isoforms <- ar_list[elementNROWS(ar_list) > 1] message("Reduce isoforms(", length(isoforms),") ... ", appendLF=FALSE) - isoforms <- mclapply(isoforms, function(x) { + isoforms <- GRangesList(mclapply(isoforms, function(x) { # For mixed strands or chrom, choose the longest if ((length(seqlevelsInUse(x)) > 1) || (length(unique(strand(x))) > 1)) { @@ -66,64 +74,65 @@ custom_makeConsensusAnnotations <- function(ar, minGap=1L, minWidth=1000L, ...) mcols(result)$gene_id <- mcols(x)$gene_id[1] } return(result) - }, ...) 
- isoforms <- unlist(GRangesList(isoforms)) + }, ...)) + isoforms <- unlist(isoforms) message("OK") - # Ensure singles is a GRanges object - if (!is(singles, "GRanges")) { - singles <- unlist(GRangesList(singles)) - } + # Check redundancy + isoforms <- removeRedundant(isoforms) + singles <- removeRedundant(singles) - # Combine removeRedundant operations - removeRedundant <- function(annox) { - if (is(annox, "GRanges")) { - o <- findOverlaps(annox, drop.self = TRUE, type = "any", drop.redundant = TRUE) - annox[-unique(c( - queryHits(o)[width(annox)[queryHits(o)] <= width(annox)[subjectHits(o)]], - subjectHits(o)[width(annox)[queryHits(o)] > width(annox)[subjectHits(o)]] - ))] - } else { - warning("Input to removeRedundant is not a GRanges object") - print(class(annox)) # Debug: Print class of annox - print(str(annox)) # Debug: Print structure of annox - return(annox) - } + o <- findOverlaps(singles, isoforms, type = "equal") + if (length(o) != 0) { + singles <- singles[-queryHits(o), ] } - isoforms <- removeRedundant(isoforms) - singles <- removeRedundant(singles) + o <- findOverlaps(singles, isoforms, type = "within") + if (length(o) != 0) { + singles <- singles[-queryHits(o), ] + } - # Use setdiff instead of multiple findOverlaps - singles <- GenomicRanges::setdiff(singles, isoforms) + o <- findOverlaps(isoforms, singles, type = "within") + if (length(o) != 0) { + isoforms <- isoforms[-queryHits(o), ] + } - noiso <- sort(c(isoforms, singles)) + noiso <- sort(c(isoforms, singles[, "gene_id"])) message("Truncate overlapped ranges ... 
", appendLF=FALSE) - - # Optimize the while loop + # with different gene_ids while(!isDisjoint(noiso)) { ol <- findOverlaps(noiso, drop.self=TRUE, drop.redundant=TRUE) - ol_df <- as.data.frame(ol) - ol_df$strand <- as.character(strand(noiso)[ol_df$queryHits]) - - # Vectorized operations instead of lapply - ol_df$new_end <- ifelse(ol_df$strand == "+", - start(noiso)[ol_df$subjectHits] - minGap, - end(noiso)[ol_df$queryHits]) - ol_df$new_start <- ifelse(ol_df$strand == "+", - start(noiso)[ol_df$queryHits], - end(noiso)[ol_df$queryHits] + minGap) - - # Update ranges in one operation - noiso <- c(noiso[-unique(c(ol_df$queryHits, ol_df$subjectHits))], - GRanges(seqnames(noiso)[ol_df$queryHits], - IRanges(ol_df$new_start, ol_df$new_end), - strand=ol_df$strand, - gene_id=mcols(noiso)$gene_id[ol_df$queryHits])) + ol_gr <- GRangesList(lapply(1:length(ol), function(x) { + sort(c( + noiso[queryHits(ol)[x]], + noiso[subjectHits(ol)[x]] + )) + })) + + # Truncate 3' end + ol_gr <- unlist(endoapply(ol_gr, function(x) { + if (as.character(strand(x[1, ])) == "+") { + end(x[1, ]) <- start(x[2, ]) - minGap + # first range's end is truncated + } else { + start(x[2, ]) <- end(x[1, ]) + minGap + # sencond range's end is truncated + } + x + })) + + # Remove any ranges with duplicated names since they already adujsted + # in the previous call + ol_gr <- ol_gr[!duplicated(names(ol_gr)), ] + + noiso <- noiso[-unique(c(queryHits(ol), subjectHits(ol))), ] + # update noiso + noiso <- c(noiso, ol_gr) } message("OK") - subset(noiso, width >= minWidth) + noiso <- noiso[width(noiso) >= minWidth, ] + return(sort(noiso)) } removeRedundant <- function(annox) { @@ -142,16 +151,17 @@ removeRedundant <- function(annox) { + ## TESTING suppressPackageStartupMessages(library(groHMM)) suppressPackageStartupMessages(library(testthat)) suppressPackageStartupMessages(library(GenomicRanges)) suppressPackageStartupMessages(library(rtracklayer)) -# Load the GTF file -gtf_url <- 
"https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf" -gtf <- import(gtf_url) test_that("custom_makeConsensusAnnotations maintains accuracy", { + # Load the GTF file + gtf_url <- "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf" + gtf <- import(gtf_url) # Extract transcripts from the GTF transcripts <- gtf[gtf$type == "transcript"] @@ -159,46 +169,41 @@ test_that("custom_makeConsensusAnnotations maintains accuracy", { # Create a sample GRanges object set.seed(42) n <- 100 - sample_ranges <- GRanges( - seqnames = "chr7", - ranges = IRanges( - start = sort(sample(1:max(end(transcripts)), n, replace = TRUE)), - width = sample(50:500, n, replace = TRUE) - ), - strand = sample(c("+", "-"), n, replace = TRUE), - gene_id = sample(transcripts$gene_id, n, replace = TRUE) - ) + sample_ranges <- transcripts + print("Sample ranges:") print(sample_ranges) # Run both original and refactored functions original_result <- groHMM::makeConsensusAnnotations(sample_ranges) refactored_result <- custom_makeConsensusAnnotations(sample_ranges) + print("Refactored result:") + print(refactored_result) + # Compare results expect_equal(length(original_result), length(refactored_result)) expect_equal(sum(width(original_result)), sum(width(refactored_result))) - expect_equal(sort(unique(mcols(original_result)$gene_id)), - sort(unique(mcols(refactored_result)$gene_id))) -}) - -test_that("custom_makeConsensusAnnotations maintains accuracy large", { - # Extract transcripts from the GTF - transcripts <- gtf[gtf$type == "transcript"] - # Create a sample GRanges object - set.seed(42) - sample_ranges <- transcripts - print(sample_ranges) - - # Run both original and refactored functions - original_result <- groHMM::makeConsensusAnnotations(sample_ranges) - refactored_result <- custom_makeConsensusAnnotations(sample_ranges) + if 
(length(original_result) > 0 && length(refactored_result) > 0) { + expect_equal( + sort(unique(mcols(original_result)$gene_id)), + sort(unique(mcols(refactored_result)$gene_id)) + ) + } else { + print("Warning: One or both results are empty.") + } - # Compare results - expect_equal(length(original_result), length(refactored_result)) - expect_equal(sum(width(original_result)), sum(width(refactored_result))) - expect_equal( - sort(unique(mcols(original_result)$gene_id)), - sort(unique(mcols(refactored_result)$gene_id)) - ) + # Additional detailed comparisons + print(paste("Original result length:", length(original_result))) + print(paste("Refactored result length:", length(refactored_result))) + print(paste("Original result total width:", sum(width(original_result)))) + print(paste("Refactored result total width:", sum(width(refactored_result)))) + print(paste("Original result unique gene_ids:", length(unique(mcols(original_result)$gene_id)))) + print(paste("Refactored result unique gene_ids:", length(unique(mcols(refactored_result)$gene_id)))) + + # Compare the first few entries + print("First 5 entries of original result:") + print(head(original_result, 5)) + print("First 5 entries of refactored result:") + print(head(refactored_result, 5)) }) From 508452bf90505e68148aa6426510c07b832c7a13 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Fri, 18 Oct 2024 16:44:01 -0500 Subject: [PATCH 49/54] refactor: Split into chunks --- bin/custom_makeConsensusAnnotations | 91 +++++++++++++++++------------ 1 file changed, 53 insertions(+), 38 deletions(-) diff --git a/bin/custom_makeConsensusAnnotations b/bin/custom_makeConsensusAnnotations index bd9040c4..01f67a77 100644 --- a/bin/custom_makeConsensusAnnotations +++ b/bin/custom_makeConsensusAnnotations @@ -26,12 +26,11 @@ #' # tx <- tx[grep("random", as.character(seqnames(tx)), invert=TRUE),] #' # ca <- makeConsensusAnnotations(tx) custom_makeConsensusAnnotations <- function(ar, minGap=1L, minWidth=1000L, ...) 
{ - # check missing gene_id - missing <- elementNROWS(mcols(ar)[,"gene_id"]) == 0 - if (any(missing)) { - ar <- ar[!missing,] - warning(sum(missing), " ranges do not have gene_id and they are - dropped") + # Check and remove ranges with missing or multiple gene_ids + valid_ranges <- vapply(mcols(ar)$gene_id, function(x) length(x) == 1, logical(1)) + if (any(!valid_ranges)) { + warning(sum(!valid_ranges), " ranges have missing or multiple gene_id and they are dropped") + ar <- ar[valid_ranges] } many <- elementNROWS(mcols(ar)[, "gene_id"]) > 1 @@ -41,42 +40,58 @@ custom_makeConsensusAnnotations <- function(ar, minGap=1L, minWidth=1000L, ...) dropped") } - ar_list <- split(ar, unlist(mcols(ar)[, "gene_id"])) - singles <- unlist(ar_list[elementNROWS(ar_list) == 1]) - isoforms <- ar_list[elementNROWS(ar_list) > 1] - - message("Reduce isoforms(", length(isoforms),") ... ", appendLF=FALSE) - isoforms <- GRangesList(mclapply(isoforms, function(x) { - # For mixed strands or chrom, choose the longest - if ((length(seqlevelsInUse(x)) > 1) || - (length(unique(strand(x))) > 1)) { - result <- x[which.max(width(x)), "gene_id"] - } else { - dx <- disjoin(x) - mcols(dx)$gene_id <- mcols(x)$gene_id[1] - olcnt <- countOverlaps(dx, x) - - multi <- dx[olcnt > 1] # Use the disjoint ranges - # covered more than once - if (length(multi) == 0) { # For non-overlapping isoforms, - # choose the longest + # Split into single-isoform and multi-isoform genes + gene_counts <- table(mcols(ar)$gene_id) + singles <- ar[mcols(ar)$gene_id %in% names(gene_counts[gene_counts == 1])] + multi_isoform_genes <- names(gene_counts[gene_counts > 1]) + + # Process multi-isoform genes in chunks + chunk_size <- 1000 # Adjust based on available memory + num_chunks <- ceiling(length(multi_isoform_genes) / chunk_size) + + isoforms <- GRangesList() + for (i in 1:num_chunks) { + chunk_start <- (i - 1) * chunk_size + 1 + chunk_end <- min(i * chunk_size, length(multi_isoform_genes)) + chunk_genes <- 
multi_isoform_genes[chunk_start:chunk_end] + + chunk_isoforms <- ar[mcols(ar)$gene_id %in% chunk_genes] + chunk_list <- split(chunk_isoforms, mcols(chunk_isoforms)$gene_id) + + message("Reduce isoforms (chunk ", i, "/", num_chunks, ") ... ", appendLF=FALSE) + chunk_result <- GRangesList(mclapply(chunk_list, function(x) { + # For mixed strands or chrom, choose the longest + if ((length(seqlevelsInUse(x)) > 1) || + (length(unique(strand(x))) > 1)) { result <- x[which.max(width(x)), "gene_id"] - } else if (length(multi) == 1) { - result <- multi } else { - reduced <- reduce(multi) - if (length(reduced) == 1) - result <- reduced - else (length(reduced) > 1) - result <- reduced[which.max(width(reduced)),] - + dx <- disjoin(x) + mcols(dx)$gene_id <- mcols(x)$gene_id[1] + olcnt <- countOverlaps(dx, x) + + multi <- dx[olcnt > 1] # Use the disjoint ranges + # covered more than once + if (length(multi) == 0) { # For non-overlapping isoforms, + # choose the longest + result <- x[which.max(width(x)), "gene_id"] + } else if (length(multi) == 1) { + result <- multi + } else { + reduced <- reduce(multi) + if (length(reduced) == 1) + result <- reduced + else (length(reduced) > 1) + result <- reduced[which.max(width(reduced)),] + + } + mcols(result)$gene_id <- mcols(x)$gene_id[1] } - mcols(result)$gene_id <- mcols(x)$gene_id[1] - } - return(result) - }, ...)) + return(result) + }, ...)) + isoforms <- c(isoforms, chunk_result) + message("OK") + } isoforms <- unlist(isoforms) - message("OK") # Check redundancy isoforms <- removeRedundant(isoforms) From b538a166542de1f733ab0601560eb49dacd8c737 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Sat, 19 Oct 2024 17:44:08 -0500 Subject: [PATCH 50/54] More tests --- bin/custom_makeConsensusAnnotations | 69 +++++++++++-- bin/parameter_tuning.R | 150 +--------------------------- 2 files changed, 60 insertions(+), 159 deletions(-) diff --git a/bin/custom_makeConsensusAnnotations b/bin/custom_makeConsensusAnnotations index 01f67a77..74c5a43e 
100644 --- a/bin/custom_makeConsensusAnnotations +++ b/bin/custom_makeConsensusAnnotations @@ -33,13 +33,6 @@ custom_makeConsensusAnnotations <- function(ar, minGap=1L, minWidth=1000L, ...) ar <- ar[valid_ranges] } - many <- elementNROWS(mcols(ar)[, "gene_id"]) > 1 - if (any(many)) { - ar <- ar[!many, ] - warning(sum(many), " ranges have multiple gene_id and they are - dropped") - } - # Split into single-isoform and multi-isoform genes gene_counts <- table(mcols(ar)$gene_id) singles <- ar[mcols(ar)$gene_id %in% names(gene_counts[gene_counts == 1])] @@ -173,13 +166,67 @@ suppressPackageStartupMessages(library(testthat)) suppressPackageStartupMessages(library(GenomicRanges)) suppressPackageStartupMessages(library(rtracklayer)) -test_that("custom_makeConsensusAnnotations maintains accuracy", { +# test_that("custom_makeConsensusAnnotations maintains accuracy", { +# # Load the GTF file +# gtf_url <- "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf" +# gtf <- import(gtf_url) + +# # Extract transcripts from the GTF +# transcripts <- gtf[gtf$type == "transcript"] + +# # Create a sample GRanges object +# set.seed(42) +# n <- 100 +# sample_ranges <- transcripts +# print("Sample ranges:") +# print(sample_ranges) + +# # Run both original and refactored functions +# original_result <- groHMM::makeConsensusAnnotations(sample_ranges) +# refactored_result <- custom_makeConsensusAnnotations(sample_ranges) + +# print("Refactored result:") +# print(refactored_result) + +# # Compare results +# expect_equal(length(original_result), length(refactored_result)) +# expect_equal(sum(width(original_result)), sum(width(refactored_result))) + +# if (length(original_result) > 0 && length(refactored_result) > 0) { +# expect_equal( +# sort(unique(mcols(original_result)$gene_id)), +# sort(unique(mcols(refactored_result)$gene_id)) +# ) +# } else { +# print("Warning: One or both results are empty.") +# } + +# # 
Additional detailed comparisons +# print(paste("Original result length:", length(original_result))) +# print(paste("Refactored result length:", length(refactored_result))) +# print(paste("Original result total width:", sum(width(original_result)))) +# print(paste("Refactored result total width:", sum(width(refactored_result)))) +# print(paste("Original result unique gene_ids:", length(unique(mcols(original_result)$gene_id)))) +# print(paste("Refactored result unique gene_ids:", length(unique(mcols(refactored_result)$gene_id)))) + +# # Compare the first few entries +# print("First 5 entries of original result:") +# print(head(original_result, 5)) +# print("First 5 entries of refactored result:") +# print(head(refactored_result, 5)) +# }) + +test_that("chm13 custom_makeConsensusAnnotations maintains accuracy", { # Load the GTF file - gtf_url <- "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf" - gtf <- import(gtf_url) + gff_url <- "./chm13v2.0_RefSeq_Liftoff_v5.1.gff3.gz" + gff <- import(gff_url) + print("GFF:") + print(gff) # Extract transcripts from the GTF - transcripts <- gtf[gtf$type == "transcript"] + transcripts <- gff[gff$type == "transcript"] + print("Transcripts:") + print(transcripts) # Create a sample GRanges object set.seed(42) diff --git a/bin/parameter_tuning.R b/bin/parameter_tuning.R index f6e750f1..b630f003 100755 --- a/bin/parameter_tuning.R +++ b/bin/parameter_tuning.R @@ -91,154 +91,6 @@ if (is.null(args$bam_files)) { stop("Please provide a bam file", call. = FALSE) } -#' makeConsensusAnnotations Makes a consensus annotation -#' -#' Makes a non-overlapping consensus annotation. Gene annotations are often -#' overalpping due to #' multiple isoforms for a gene. -#' In consensus annotation, isoforms are first reduced so that only -#' redundant intervals are used to represent a genomic interval for a gene, -#' i.e., a gene id. 
-#' Remaining unresolved annotations are further reduced by truncating 3' -#' end of annotations. -#' -#' Supports parallel processing using mclapply in the 'parallel' package. -#' To change the number of processors, use the argument 'mc.cores'. -#' -#' @param ar GRanges of annotations to be collapsed. -#' @param minGap Minimun gap between overlapped annotations after truncated. -#' Default: 1L -#' @param minWidth Minimun width of consensus annotations. Default: 1000L -#' @param ... Extra argument passed to mclapply. -#' @return Returns GRanges object of annotations. -#' @author Minho Chae -#' @examples -#' ## Not run: -#' # library(TxDb.Hsapiens.UCSC.hg19.knownGene) -#' # txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene -#' # tx <- transcripts(txdb, columns=c("gene_id", "tx_id", "tx_name"), -#' filter=list(tx_chrom="chr7")) -#' # tx <- tx[grep("random", as.character(seqnames(tx)), invert=TRUE),] -#' # ca <- makeConsensusAnnotations(tx) -custom_makeConsensusAnnotations <- function(ar, minGap=1L, minWidth=1000L, ...) { - # check missing gene_id - missing <- elementNROWS(mcols(ar)[,"gene_id"]) == 0 - if (any(missing)) { - ar <- ar[!missing,] - warning(sum(missing), " ranges do not have gene_id and they are - dropped") - } - - many <- elementNROWS(mcols(ar)[,"gene_id"]) > 1 - if (any(many)) { - ar <- ar[!many,] - warning(sum(many), " ranges have multiple gene_id and they are - dropped") - } - - ar_list <- split(ar, unlist(mcols(ar)[,"gene_id"])) - singles <- unlist(ar_list[elementNROWS(ar_list) == 1]) - isoforms <- ar_list[elementNROWS(ar_list) > 1] - - message("Reduce isoforms(", length(isoforms),") ... 
", appendLF=FALSE) - isoforms <- GRangesList(mclapply(isoforms, function(x) { - # For mixed strands or chrom, choose the longest - if ((length(seqlevelsInUse(x)) > 1) || - (length(unique(strand(x))) > 1)) { - result <- x[which.max(width(x)), "gene_id"] - } else { - dx <- disjoin(x) - mcols(dx)$gene_id <- mcols(x)$gene_id[1] - olcnt <- countOverlaps(dx, x) - - multi <- dx[olcnt > 1] # Use the disjoint ranges - # covered more than once - if (length(multi) == 0) { # For non-overlapping isoforms, - # choose the longest - result <- x[which.max(width(x)), "gene_id"] - } else if (length(multi) == 1) { - result <- multi - } else { - reduced <- reduce(multi) - if (length(reduced) == 1) - result <- reduced - else (length(reduced) > 1) - result <- reduced[which.max(width(reduced)),] - - } - mcols(result)$gene_id <- mcols(x)$gene_id[1] - } - return(result) - }, ...)) - isoforms <- unlist(isoforms) - message("OK") - - # Check redundancy - isoforms <- removeRedundant(isoforms) - singles <- removeRedundant(singles) - - o <- findOverlaps(singles, isoforms, type="equal") - if(length(o) != 0) - singles <- singles[-queryHits(o),] - - o <- findOverlaps(singles, isoforms, type="within") - if(length(o) != 0) - singles <- singles[-queryHits(o),] - - o <- findOverlaps(isoforms, singles, type="within") - if(length(o) != 0) - isoforms <- isoforms[-queryHits(o),] - - noiso <- sort(c(isoforms, singles[,"gene_id"])) - message("Truncate overlapped ranges ... 
", appendLF=FALSE) - # with different gene_ids - while(!isDisjoint(noiso)) { - ol <- findOverlaps(noiso, drop.self=TRUE, drop.redundant=TRUE) - ol_gr <- GRangesList(lapply(1:length(ol), function(x) { - sort(c(noiso[queryHits(ol)[x]], - noiso[subjectHits(ol)[x]])) - })) - - # Truncate 3' end - ol_gr <- unlist(endoapply(ol_gr, function(x) { - if (as.character(strand(x[1,])) == "+") { - end(x[1,]) <- start(x[2,]) - minGap - # first range's end is truncated - } else { - start(x[2,]) <- end(x[1,]) + minGap - # sencond range's end is truncated - } - x - })) - - # Remove any ranges with duplicated names since they already adujsted - # in the previous call - ol_gr <- ol_gr[!duplicated(names(ol_gr)),] - - noiso <- noiso[-unique(c(queryHits(ol), subjectHits(ol))),] - # update noiso - noiso <- c(noiso, ol_gr) - } - message("OK") - - noiso <- noiso[width(noiso) >= minWidth,] - return(sort(noiso)) -} - -removeRedundant <- function(annox) { - o <- findOverlaps(annox, drop.self=TRUE, type="equal", - drop.redundant=TRUE) - if(length(o) != 0) - annox <- annox[-subjectHits(o),] - - o <- findOverlaps(annox, drop.self=TRUE, type="within", - drop.redundant=TRUE) - if(length(o) != 0) - annox <- annox[-queryHits(o),] - - return(annox) -} - - # Load alignment files # TODO? CHANGE BASED ON PAIRED OR SINGLE END @@ -371,3 +223,5 @@ if (file.exists(r_log_file) == FALSE) { print(a) sink() } + + From ef677b1a1fbaab16b1f94c5d8ef4134e379ff7a2 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Sat, 19 Oct 2024 17:47:18 -0500 Subject: [PATCH 51/54] fix(grohmm): Support gxf Scrap all of the CHM13 attempts I think it's the gtf conversion stuff that's causing the memory overload. But I can get them all to load locally. So I'm going to try to use gff3 files, if they're supplied, otherwise fall back to gtf. 
--- ...ions => custom_makeConsensusAnnotations.R} | 14 ++ bin/parameter_tuning.R | 71 +-------- bin/transcriptcalling_grohmm.R | 6 +- modules/local/grohmm/parametertuning/main.nf | 4 +- .../local/grohmm/transcriptcalling/main.nf | 4 +- .../local/transcript_identification.nf | 4 +- workflows/nascent.nf | 4 +- .../grohmm/only_gff/main.nf.test | 57 ++++++++ .../grohmm/only_gff/main.nf.test.snap | 136 ++++++++++++++++++ 9 files changed, 226 insertions(+), 74 deletions(-) rename bin/{custom_makeConsensusAnnotations => custom_makeConsensusAnnotations.R} (96%) create mode 100644 workflows/tests/transcript_indentification/grohmm/only_gff/main.nf.test create mode 100644 workflows/tests/transcript_indentification/grohmm/only_gff/main.nf.test.snap diff --git a/bin/custom_makeConsensusAnnotations b/bin/custom_makeConsensusAnnotations.R similarity index 96% rename from bin/custom_makeConsensusAnnotations rename to bin/custom_makeConsensusAnnotations.R index 74c5a43e..f51eda7d 100644 --- a/bin/custom_makeConsensusAnnotations +++ b/bin/custom_makeConsensusAnnotations.R @@ -165,6 +165,7 @@ suppressPackageStartupMessages(library(groHMM)) suppressPackageStartupMessages(library(testthat)) suppressPackageStartupMessages(library(GenomicRanges)) suppressPackageStartupMessages(library(rtracklayer)) +library(GenomicFeatures) # test_that("custom_makeConsensusAnnotations maintains accuracy", { # # Load the GTF file @@ -216,6 +217,17 @@ suppressPackageStartupMessages(library(rtracklayer)) # print(head(refactored_result, 5)) # }) +gff <- "./chm13v2.0_RefSeq_Liftoff_v5.1.gff3.gz" +kg_db <- makeTxDbFromGFF(gff) +kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name")) +print("Collapse annotations in preparation for overlap") +kg_consensus <- makeConsensusAnnotations( + kg_tx, + mc.cores = 2 +) +print("Kg consensus:") +print(kg_consensus) + test_that("chm13 custom_makeConsensusAnnotations maintains accuracy", { # Load the GTF file gff_url <- 
"./chm13v2.0_RefSeq_Liftoff_v5.1.gff3.gz" @@ -239,6 +251,8 @@ test_that("chm13 custom_makeConsensusAnnotations maintains accuracy", { original_result <- groHMM::makeConsensusAnnotations(sample_ranges) refactored_result <- custom_makeConsensusAnnotations(sample_ranges) + print("Original result:") + print(original_result) print("Refactored result:") print(refactored_result) diff --git a/bin/parameter_tuning.R b/bin/parameter_tuning.R index b630f003..de323bf5 100755 --- a/bin/parameter_tuning.R +++ b/bin/parameter_tuning.R @@ -56,11 +56,11 @@ parser$add_argument( ) parser$add_argument( "-g", - "--gtf", + "--gxf", type = "character", default = NULL, metavar = "string", - help = "GTF File to create TxDb", + help = "GFF/GTF File to create TxDb", required = TRUE ) parser$add_argument( @@ -75,7 +75,6 @@ parser$add_argument( "-m", "--memory", type = "integer", - default = 56000, metavar = "integer", help = "Amount of memory in MB" ) @@ -91,7 +90,6 @@ if (is.null(args$bam_files)) { stop("Please provide a bam file", call. = FALSE) } - # Load alignment files # TODO? 
CHANGE BASED ON PAIRED OR SINGLE END alignments <- c() @@ -104,65 +102,12 @@ for (bam in args$bam_files) { } print("Input transcript annotations") -# Import the GTF file using rtracklayer -gtf <- import(args$gtf) - -# Exclude any transcripts located on chromosomes labeled with "random" -gtf <- gtf[!grepl("random", seqnames(gtf)), ] - -# Extract transcript-level features -transcripts_gtf <- gtf[gtf$type == "transcript", ] -# Extract exon features -exons_gtf <- gtf[gtf$type == "exon", ] - -# Ensure that the 'transcript_id' and 'gene_id' columns are present -if (!all(c("transcript_id", "gene_id") %in% names(mcols(exons_gtf)))) { - stop("The GTF file lacks 'transcript_id' or 'gene_id' in its attributes.") -} - -# Group exons by transcript_id -exons_by_transcript <- split(exons_gtf, exons_gtf$transcript_id) - -# Diagnostic prints -print(paste("Number of transcripts:", length(exons_by_transcript))) - -# Reduce exons to create transcript ranges -transcripts_ranges <- GenomicRanges::reduce(exons_by_transcript) -transcripts_ranges <- unlist(transcripts_ranges, use.names = TRUE) - -# Diagnostic prints after reduction -print(paste("Number of transcripts_ranges after reduction:", length(transcripts_ranges))) - -# Create mapping dataframe -mapping_df <- data.frame( - transcript_id = names(transcripts_ranges), - gene_id = vapply(exons_by_transcript[names(transcripts_ranges)], function(x) unique(x$gene_id)[1], character(1)), - stringsAsFactors = FALSE -) - -# Check for length mismatch -if (nrow(mapping_df) != length(transcripts_ranges)) { - stop(paste("Length mismatch between mapping_df and transcripts_ranges:", nrow(mapping_df), length(transcripts_ranges))) -} - -# Assign metadata -mcols(transcripts_ranges)$transcript_id <- mapping_df$transcript_id -mcols(transcripts_ranges)$gene_id <- mapping_df$gene_id - -# Assign seqnames and strand from the exons -seqnames(transcripts_ranges) <- seqnames(exons_gtf[match(names(transcripts_ranges), exons_gtf$transcript_id)]) 
-strand(transcripts_ranges) <- strand(exons_gtf[match(names(transcripts_ranges), exons_gtf$transcript_id)]) - -# Ensure that seqlevels are set correctly -seqlevels(transcripts_ranges) <- seqlevels(gtf) - -# Remove any transcripts with NA values -transcripts_ranges <- transcripts_ranges[!is.na(start(transcripts_ranges)) & !is.na(end(transcripts_ranges))] - +kg_db <- makeTxDbFromGFF(args$gxf) +kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name")) print("Collapse annotations in preparation for overlap") -kg_consensus <- custom_makeConsensusAnnotations( - transcripts_ranges, - mc.cores = min(args$cores, 10) # 10 the number they had hardcoded in the grohmm package for some reason +kg_consensus <- makeConsensusAnnotations( + kg_tx, + mc.cores = args$cores ) print("Finished consensus annotations") @@ -223,5 +168,3 @@ if (file.exists(r_log_file) == FALSE) { print(a) sink() } - - diff --git a/bin/transcriptcalling_grohmm.R b/bin/transcriptcalling_grohmm.R index 883daa18..5c48c64e 100755 --- a/bin/transcriptcalling_grohmm.R +++ b/bin/transcriptcalling_grohmm.R @@ -64,11 +64,11 @@ parser$add_argument( ) parser$add_argument( "-g", - "--gtf", + "--gxf", type = "character", default = NULL, metavar = "string", - help = "GTF File to create TxDb", + help = "GFF/GTF File to create TxDb", required = TRUE ) parser$add_argument( @@ -143,7 +143,7 @@ write.table( ) print("Input transcript annotations") -kg_db <- makeTxDbFromGFF(args$gtf) +kg_db <- makeTxDbFromGFF(args$gxf) kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name")) # TODO I wonder if I could speed things up by filtering by chromosome at the Nextflow level # https://github.com/google/deepvariant/issues/744 diff --git a/modules/local/grohmm/parametertuning/main.nf b/modules/local/grohmm/parametertuning/main.nf index fef7cab3..ad5ac5c7 100644 --- a/modules/local/grohmm/parametertuning/main.nf +++ b/modules/local/grohmm/parametertuning/main.nf @@ -10,7 +10,7 @@ process GROHMM_PARAMETERTUNING { 
input: tuple val(meta), path(bams), path(bais) - path gtf + path gxf each UTS each LtProbB @@ -29,7 +29,7 @@ process GROHMM_PARAMETERTUNING { parameter_tuning.R \\ --bam_file ${bams} \\ --outprefix ${prefix} \\ - --gtf $gtf \\ + --gxf $gxf \\ --uts $UTS \\ --ltprobb $LtProbB \\ --outdir ./ \\ diff --git a/modules/local/grohmm/transcriptcalling/main.nf b/modules/local/grohmm/transcriptcalling/main.nf index 7935daae..07cba0ca 100644 --- a/modules/local/grohmm/transcriptcalling/main.nf +++ b/modules/local/grohmm/transcriptcalling/main.nf @@ -10,7 +10,7 @@ process GROHMM_TRANSCRIPTCALLING { input: tuple val(meta), path(bams), path(bais), path(tuning_file) - path gtf + path gxf output: tuple val(meta), path("*.transcripts.txt"), emit: transcripts @@ -32,7 +32,7 @@ process GROHMM_TRANSCRIPTCALLING { --bam_file ${bams} \\ --tuning_file ${tuning_file} \\ --outprefix ${prefix} \\ - --gtf $gtf \\ + --gxf $gxf \\ --outdir ./ \\ --cores $task.cpus \\ --memory ${task.memory.toMega()} \\ diff --git a/subworkflows/local/transcript_identification.nf b/subworkflows/local/transcript_identification.nf index b1217a90..b2f4883d 100644 --- a/subworkflows/local/transcript_identification.nf +++ b/subworkflows/local/transcript_identification.nf @@ -14,7 +14,7 @@ include { BEDTOOLS_INTERSECT } from '../../modules/nf-core/bedtools/intersect/ma workflow TRANSCRIPT_INDENTIFICATION { take: group_bam_bai - gtf + gxf fasta chrom_sizes @@ -25,7 +25,7 @@ workflow TRANSCRIPT_INDENTIFICATION { grohmm_td_plot = Channel.empty() if(!params.skip_grohmm && params.assay_type == "GROseq") { - GROHMM ( group_bam_bai, gtf ) + GROHMM ( group_bam_bai, gxf ) ch_identification_bed = ch_identification_bed.mix(GROHMM.out.bed) grohmm_td_plot = GROHMM.out.td_plot ch_versions = ch_versions.mix(GROHMM.out.versions.first()) diff --git a/workflows/nascent.nf b/workflows/nascent.nf index d4675c1d..06e7f16d 100644 --- a/workflows/nascent.nf +++ b/workflows/nascent.nf @@ -286,9 +286,11 @@ workflow NASCENT { ch_group_bam_bai 
= ch_group_bam.join(ch_group_bai, by: [0]) + ch_gxf = ch_gff ? ch_gff : PREPARE_GENOME.out.gtf + TRANSCRIPT_INDENTIFICATION ( ch_group_bam_bai, - PREPARE_GENOME.out.gtf, + ch_gxf, PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.chrom_sizes, ) diff --git a/workflows/tests/transcript_indentification/grohmm/only_gff/main.nf.test b/workflows/tests/transcript_indentification/grohmm/only_gff/main.nf.test new file mode 100644 index 00000000..4e8ab43d --- /dev/null +++ b/workflows/tests/transcript_indentification/grohmm/only_gff/main.nf.test @@ -0,0 +1,57 @@ +nextflow_pipeline { + + name "GROHMM" + script "../../../../../main.nf" + tag "input" + tag "grohmm" + tag "gff" + tag "chm13" + // triggers 'bin/parameter_tuning.R', 'bin/transcriptcalling_grohmm.R' + + test("Should run groHMM with only a GFF file") { + when { + params { + outdir = "$outputDir" + skip_grohmm = false + gff = 'https://huggingface.co/datasets/edmundmiller/nascent-test-data/resolve/main/chm13v2.0_RefSeq_Liftoff_v5.1.chr21.gff3.gz' + // TODO Use CHM13 Fasta + gtf = null + bed = null + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(UTILS.removeNextflowVersion("$outputDir/pipeline_info/nf_core_pipeline_software_mqc_versions.yml")).match("software_versions") }, + { assert snapshot( + workflow.trace.tasks().size(), + path("$outputDir/transcript_identification/homer/cd4.bed"), + path("$outputDir/transcript_identification/homer/jurkat.bed"), + // FIXME Not determinstic because of the order of files + // Add to the other tests when fixed + // UTILS.getAllFilesFromDir("$outputDir/transcript_identification/pints/", ".bed"), + path("$outputDir/transcript_identification/intersect/").list(), + path("$outputDir/transcript_identification/filtered/").list(), + path("$outputDir/transcript_identification/intersect/").list(), + path("$outputDir/transcript_identification/filtered/").list(), + path("$outputDir/transcript_identification/grohmm/cd4.eval.txt"), + 
path("$outputDir/transcript_identification/grohmm/cd4.final.transcripts.bed"), + path("$outputDir/transcript_identification/grohmm/cd4.tdFinal.txt"), + path("$outputDir/transcript_identification/grohmm/cd4.tdplot_mqc.png").exists(), + path("$outputDir/transcript_identification/grohmm/cd4.transcripts.txt"), + path("$outputDir/transcript_identification/grohmm/jurkat.eval.txt"), + path("$outputDir/transcript_identification/grohmm/jurkat.final.transcripts.bed"), + path("$outputDir/transcript_identification/grohmm/jurkat.tdFinal.txt"), + path("$outputDir/transcript_identification/grohmm/jurkat.tdplot_mqc.png").exists(), + path("$outputDir/transcript_identification/grohmm/jurkat.transcripts.txt"), + // FIXME Not determinstic because of the order of files + // Add to the other tests when fixed + // path("$outputDir/quantification/").list(), + path("$outputDir/multiqc/multiqc_report.html").exists(), + ).match("output_files") + } + ) + } + } +} diff --git a/workflows/tests/transcript_indentification/grohmm/only_gff/main.nf.test.snap b/workflows/tests/transcript_indentification/grohmm/only_gff/main.nf.test.snap new file mode 100644 index 00000000..3df385a7 --- /dev/null +++ b/workflows/tests/transcript_indentification/grohmm/only_gff/main.nf.test.snap @@ -0,0 +1,136 @@ +{ + "output_files": { + "content": [ + 217, + "cd4.bed:md5,b55e5290d78941f36c3d1ecfef8e0062", + "jurkat.bed:md5,383cfaf10535dbe5d7f47607e345f4cb", + [ + "cd4_intersect.bed:md5,51800dceda87c980e91db92b265b7ec6", + "jurkat_intersect.bed:md5,6d205f43e81053ab3a8af5096c065b66" + ], + [ + "cd4_filtered.bed:md5,3de4c6a675817a0661bb24edfd06d155", + "jurkat_filtered.bed:md5,191963fbc44ef9e5c3261a3fb96361a0" + ], + [ + "cd4_intersect.bed:md5,51800dceda87c980e91db92b265b7ec6", + "jurkat_intersect.bed:md5,6d205f43e81053ab3a8af5096c065b66" + ], + [ + "cd4_filtered.bed:md5,3de4c6a675817a0661bb24edfd06d155", + "jurkat_filtered.bed:md5,191963fbc44ef9e5c3261a3fb96361a0" + ], + 
"cd4.eval.txt:md5,b58c1db2cceac1fe1d89dec5838317d1", + "cd4.final.transcripts.bed:md5,a34dc280b3b023186b4935780d379338", + "cd4.tdFinal.txt:md5,8b49613bc2aa4dcbc23d852e46c41fb7", + true, + "cd4.transcripts.txt:md5,891a07c573dac76de5c466453c206c4f", + "jurkat.eval.txt:md5,1b9b3cf2f8b832ee4863168a07630db7", + "jurkat.final.transcripts.bed:md5,0d60cc43aadb918debf7cbd9113d53b4", + "jurkat.tdFinal.txt:md5,d10046e79b6c6ccff86d42b1bd59cfa4", + true, + "jurkat.transcripts.txt:md5,aae6fb7cf85d2cc91db484a9399da876", + true + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-19T21:30:10.156904" + }, + "software_versions": { + "content": [ + { + "BBMAP_PILEUP": { + "bbmap": 39.01, + "samtools": "1.16.1", + "pigz": 2.6 + }, + "BEDTOOLS_GENOMECOV_MINUS": { + "bedtools": "2.31.1" + }, + "BEDTOOLS_GENOMECOV_PLUS": { + "bedtools": "2.31.1" + }, + "BEDTOOLS_INTERSECT": { + "bedtools": "2.31.1" + }, + "BEDTOOLS_INTERSECT_FILTER": { + "bedtools": "2.31.1" + }, + "BWA_INDEX": { + "bwa": "0.7.18-r1243-dirty" + }, + "BWA_MEM": { + "bwa": "0.7.18-r1243-dirty", + "samtools": 1.2 + }, + "CUSTOM_GETCHROMSIZES": { + "getchromsizes": 1.2 + }, + "DEEPTOOLS_BAMCOVERAGE_MINUS": { + "deeptools": "3.5.1" + }, + "DEEPTOOLS_BAMCOVERAGE_PLUS": { + "deeptools": "3.5.1" + }, + "FASTP": { + "fastp": "0.23.4" + }, + "FASTQC": { + "fastqc": "0.12.1" + }, + "GFFREAD": { + "gffread": "0.12.7" + }, + "GROHMM_PARAMETERTUNING": { + "r-base": "4.3.3", + "bioconductor-grohmm": "1.39.0" + }, + "GTF2BED": { + "perl": "5.26.2" + }, + "GUNZIP_GFF": { + "gunzip": 1.1 + }, + "HOMER_MAKETAGDIRECTORY": { + "homer": 4.11, + "samtools": 1.11 + }, + "PINTS_CALLER": { + "python": "3.10.6", + "pints": "1.1.8" + }, + "PRESEQ_CCURVE": { + "preseq": "3.1.1" + }, + "PRESEQ_LCEXTRAP": { + "preseq": "3.1.1" + }, + "RSEQC_INFEREXPERIMENT": { + "rseqc": "5.0.2" + }, + "RSEQC_READDISTRIBUTION": { + "rseqc": "5.0.2" + }, + "RSEQC_READDUPLICATION": { + "rseqc": "5.0.2" + }, + 
"SUBREAD_FEATURECOUNTS_GENE": { + "subread": "2.0.1" + }, + "SUBREAD_FEATURECOUNTS_PREDICTED": { + "subread": "2.0.1" + }, + "Workflow": { + "nf-core/nascent": "v2.3.0dev" + } + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-19T21:30:10.110132" + } +} \ No newline at end of file From afbb7b63ced0d30f016ec9d4b134536854dadc9d Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Sun, 20 Oct 2024 21:47:44 -0500 Subject: [PATCH 52/54] fix(chm13): Prevent some errors during transcriptcalling --- bin/transcriptcalling_grohmm.R | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/bin/transcriptcalling_grohmm.R b/bin/transcriptcalling_grohmm.R index 5c48c64e..3f362275 100755 --- a/bin/transcriptcalling_grohmm.R +++ b/bin/transcriptcalling_grohmm.R @@ -180,7 +180,19 @@ con_expressed <- get_expressed_annotations( b_plus <- breakTranscriptsOnGenes(tx_hmm, kg_consensus, strand = "+") b_minus <- breakTranscriptsOnGenes(tx_hmm, kg_consensus, strand = "-") tx_broken <- c(b_plus, b_minus) -tx_final <- combineTranscripts(tx_broken, kg_consensus) +# Assign unique IDs if they're missing +if (is.null(mcols(tx_broken)$transcript_id) || any(is.na(mcols(tx_broken)$transcript_id))) { + mcols(tx_broken)$transcript_id <- paste0("TX", seq_along(tx_broken)) +} + +# Filter out any transcripts with NA values in start or end positions +tx_broken_filtered <- tx_broken[!is.na(start(tx_broken)) & !is.na(end(tx_broken))] + +# Ensure that kg_consensus also doesn't contain NA values +kg_consensus_filtered <- kg_consensus[!is.na(start(kg_consensus)) & !is.na(end(kg_consensus))] + +# Now call combineTranscripts with the filtered data +tx_final <- combineTranscripts(tx_broken_filtered, kg_consensus_filtered) export( tx_final, con = paste(args$outprefix, ".final.transcripts.bed", sep = "") From 5c6b24aa2ceca66df9e3218163afe902fb990c43 Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Mon, 21 Oct 2024 10:17:32 -0500 Subject: [PATCH 
53/54] style(grohmm): Clean up Editorconfig, styler, consistant naming --- .editorconfig | 6 +- bin/custom_makeConsensusAnnotations.R | 285 ------------------ ...eter_tuning.R => grohmm_parametertuning.R} | 4 +- ...ng_grohmm.R => grohmm_transcriptcalling.R} | 27 +- modules/local/grohmm/parametertuning/main.nf | 2 +- .../local/grohmm/transcriptcalling/main.nf | 2 +- subworkflows/local/grohmm/tests/main.nf.test | 13 +- 7 files changed, 35 insertions(+), 304 deletions(-) delete mode 100644 bin/custom_makeConsensusAnnotations.R rename bin/{parameter_tuning.R => grohmm_parametertuning.R} (96%) rename bin/{transcriptcalling_grohmm.R => grohmm_transcriptcalling.R} (88%) diff --git a/.editorconfig b/.editorconfig index 72dda289..e957201e 100644 --- a/.editorconfig +++ b/.editorconfig @@ -8,7 +8,7 @@ trim_trailing_whitespace = true indent_size = 4 indent_style = space -[*.{md,yml,yaml,html,css,scss,js}] +[*.{md,yml,yaml,html,css,scss,js,R,Rmd}] indent_size = 2 # These files are edited and tested upstream in nf-core/modules @@ -31,3 +31,7 @@ indent_size = unset # ignore python and markdown [*.{py,md}] indent_style = unset + +# Follow tidyverse style for R +[*.{R,Rmd}] +indent_size = 2 diff --git a/bin/custom_makeConsensusAnnotations.R b/bin/custom_makeConsensusAnnotations.R deleted file mode 100644 index f51eda7d..00000000 --- a/bin/custom_makeConsensusAnnotations.R +++ /dev/null @@ -1,285 +0,0 @@ -#' custom_makeConsensusAnnotations Makes a consensus annotation #' -#' Makes a non-overlapping consensus annotation. Gene annotations are often -#' overalpping due to #' multiple isoforms for a gene. -#' In consensus annotation, isoforms are first reduced so that only -#' redundant intervals are used to represent a genomic interval for a gene, -#' i.e., a gene id. -#' Remaining unresolved annotations are further reduced by truncating 3' -#' end of annotations. -#' -#' Supports parallel processing using mclapply in the 'parallel' package. 
-#' To change the number of processors, use the argument 'mc.cores'. -#' -#' @param ar GRanges of annotations to be collapsed. -#' @param minGap Minimun gap between overlapped annotations after truncated. -#' Default: 1L -#' @param minWidth Minimun width of consensus annotations. Default: 1000L -#' @param ... Extra argument passed to mclapply. -#' @return Returns GRanges object of annotations. -#' @author Minho Chae -#' @examples -#' ## Not run: -#' # library(TxDb.Hsapiens.UCSC.hg19.knownGene) -#' # txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene -#' # tx <- transcripts(txdb, columns=c("gene_id", "tx_id", "tx_name"), -#' filter=list(tx_chrom="chr7")) -#' # tx <- tx[grep("random", as.character(seqnames(tx)), invert=TRUE),] -#' # ca <- makeConsensusAnnotations(tx) -custom_makeConsensusAnnotations <- function(ar, minGap=1L, minWidth=1000L, ...) { - # Check and remove ranges with missing or multiple gene_ids - valid_ranges <- vapply(mcols(ar)$gene_id, function(x) length(x) == 1, logical(1)) - if (any(!valid_ranges)) { - warning(sum(!valid_ranges), " ranges have missing or multiple gene_id and they are dropped") - ar <- ar[valid_ranges] - } - - # Split into single-isoform and multi-isoform genes - gene_counts <- table(mcols(ar)$gene_id) - singles <- ar[mcols(ar)$gene_id %in% names(gene_counts[gene_counts == 1])] - multi_isoform_genes <- names(gene_counts[gene_counts > 1]) - - # Process multi-isoform genes in chunks - chunk_size <- 1000 # Adjust based on available memory - num_chunks <- ceiling(length(multi_isoform_genes) / chunk_size) - - isoforms <- GRangesList() - for (i in 1:num_chunks) { - chunk_start <- (i - 1) * chunk_size + 1 - chunk_end <- min(i * chunk_size, length(multi_isoform_genes)) - chunk_genes <- multi_isoform_genes[chunk_start:chunk_end] - - chunk_isoforms <- ar[mcols(ar)$gene_id %in% chunk_genes] - chunk_list <- split(chunk_isoforms, mcols(chunk_isoforms)$gene_id) - - message("Reduce isoforms (chunk ", i, "/", num_chunks, ") ... 
", appendLF=FALSE) - chunk_result <- GRangesList(mclapply(chunk_list, function(x) { - # For mixed strands or chrom, choose the longest - if ((length(seqlevelsInUse(x)) > 1) || - (length(unique(strand(x))) > 1)) { - result <- x[which.max(width(x)), "gene_id"] - } else { - dx <- disjoin(x) - mcols(dx)$gene_id <- mcols(x)$gene_id[1] - olcnt <- countOverlaps(dx, x) - - multi <- dx[olcnt > 1] # Use the disjoint ranges - # covered more than once - if (length(multi) == 0) { # For non-overlapping isoforms, - # choose the longest - result <- x[which.max(width(x)), "gene_id"] - } else if (length(multi) == 1) { - result <- multi - } else { - reduced <- reduce(multi) - if (length(reduced) == 1) - result <- reduced - else (length(reduced) > 1) - result <- reduced[which.max(width(reduced)),] - - } - mcols(result)$gene_id <- mcols(x)$gene_id[1] - } - return(result) - }, ...)) - isoforms <- c(isoforms, chunk_result) - message("OK") - } - isoforms <- unlist(isoforms) - - # Check redundancy - isoforms <- removeRedundant(isoforms) - singles <- removeRedundant(singles) - - o <- findOverlaps(singles, isoforms, type = "equal") - if (length(o) != 0) { - singles <- singles[-queryHits(o), ] - } - - o <- findOverlaps(singles, isoforms, type = "within") - if (length(o) != 0) { - singles <- singles[-queryHits(o), ] - } - - o <- findOverlaps(isoforms, singles, type = "within") - if (length(o) != 0) { - isoforms <- isoforms[-queryHits(o), ] - } - - noiso <- sort(c(isoforms, singles[, "gene_id"])) - message("Truncate overlapped ranges ... 
", appendLF=FALSE) - # with different gene_ids - while(!isDisjoint(noiso)) { - ol <- findOverlaps(noiso, drop.self=TRUE, drop.redundant=TRUE) - ol_gr <- GRangesList(lapply(1:length(ol), function(x) { - sort(c( - noiso[queryHits(ol)[x]], - noiso[subjectHits(ol)[x]] - )) - })) - - # Truncate 3' end - ol_gr <- unlist(endoapply(ol_gr, function(x) { - if (as.character(strand(x[1, ])) == "+") { - end(x[1, ]) <- start(x[2, ]) - minGap - # first range's end is truncated - } else { - start(x[2, ]) <- end(x[1, ]) + minGap - # sencond range's end is truncated - } - x - })) - - # Remove any ranges with duplicated names since they already adujsted - # in the previous call - ol_gr <- ol_gr[!duplicated(names(ol_gr)), ] - - noiso <- noiso[-unique(c(queryHits(ol), subjectHits(ol))), ] - # update noiso - noiso <- c(noiso, ol_gr) - } - message("OK") - - noiso <- noiso[width(noiso) >= minWidth, ] - return(sort(noiso)) -} - -removeRedundant <- function(annox) { - o <- findOverlaps(annox, drop.self=TRUE, type="equal", - drop.redundant=TRUE) - if(length(o) != 0) - annox <- annox[-subjectHits(o),] - - o <- findOverlaps(annox, drop.self=TRUE, type="within", - drop.redundant=TRUE) - if(length(o) != 0) - annox <- annox[-queryHits(o),] - - return(annox) -} - - - - -## TESTING -suppressPackageStartupMessages(library(groHMM)) -suppressPackageStartupMessages(library(testthat)) -suppressPackageStartupMessages(library(GenomicRanges)) -suppressPackageStartupMessages(library(rtracklayer)) -library(GenomicFeatures) - -# test_that("custom_makeConsensusAnnotations maintains accuracy", { -# # Load the GTF file -# gtf_url <- "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf" -# gtf <- import(gtf_url) - -# # Extract transcripts from the GTF -# transcripts <- gtf[gtf$type == "transcript"] - -# # Create a sample GRanges object -# set.seed(42) -# n <- 100 -# sample_ranges <- transcripts -# print("Sample ranges:") -# 
print(sample_ranges) - -# # Run both original and refactored functions -# original_result <- groHMM::makeConsensusAnnotations(sample_ranges) -# refactored_result <- custom_makeConsensusAnnotations(sample_ranges) - -# print("Refactored result:") -# print(refactored_result) - -# # Compare results -# expect_equal(length(original_result), length(refactored_result)) -# expect_equal(sum(width(original_result)), sum(width(refactored_result))) - -# if (length(original_result) > 0 && length(refactored_result) > 0) { -# expect_equal( -# sort(unique(mcols(original_result)$gene_id)), -# sort(unique(mcols(refactored_result)$gene_id)) -# ) -# } else { -# print("Warning: One or both results are empty.") -# } - -# # Additional detailed comparisons -# print(paste("Original result length:", length(original_result))) -# print(paste("Refactored result length:", length(refactored_result))) -# print(paste("Original result total width:", sum(width(original_result)))) -# print(paste("Refactored result total width:", sum(width(refactored_result)))) -# print(paste("Original result unique gene_ids:", length(unique(mcols(original_result)$gene_id)))) -# print(paste("Refactored result unique gene_ids:", length(unique(mcols(refactored_result)$gene_id)))) - -# # Compare the first few entries -# print("First 5 entries of original result:") -# print(head(original_result, 5)) -# print("First 5 entries of refactored result:") -# print(head(refactored_result, 5)) -# }) - -gff <- "./chm13v2.0_RefSeq_Liftoff_v5.1.gff3.gz" -kg_db <- makeTxDbFromGFF(gff) -kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name")) -print("Collapse annotations in preparation for overlap") -kg_consensus <- makeConsensusAnnotations( - kg_tx, - mc.cores = 2 -) -print("Kg consensus:") -print(kg_consensus) - -test_that("chm13 custom_makeConsensusAnnotations maintains accuracy", { - # Load the GTF file - gff_url <- "./chm13v2.0_RefSeq_Liftoff_v5.1.gff3.gz" - gff <- import(gff_url) - - print("GFF:") - print(gff) - # 
Extract transcripts from the GTF - transcripts <- gff[gff$type == "transcript"] - print("Transcripts:") - print(transcripts) - - # Create a sample GRanges object - set.seed(42) - n <- 100 - sample_ranges <- transcripts - print("Sample ranges:") - print(sample_ranges) - - # Run both original and refactored functions - original_result <- groHMM::makeConsensusAnnotations(sample_ranges) - refactored_result <- custom_makeConsensusAnnotations(sample_ranges) - - print("Original result:") - print(original_result) - print("Refactored result:") - print(refactored_result) - - # Compare results - expect_equal(length(original_result), length(refactored_result)) - expect_equal(sum(width(original_result)), sum(width(refactored_result))) - - if (length(original_result) > 0 && length(refactored_result) > 0) { - expect_equal( - sort(unique(mcols(original_result)$gene_id)), - sort(unique(mcols(refactored_result)$gene_id)) - ) - } else { - print("Warning: One or both results are empty.") - } - - # Additional detailed comparisons - print(paste("Original result length:", length(original_result))) - print(paste("Refactored result length:", length(refactored_result))) - print(paste("Original result total width:", sum(width(original_result)))) - print(paste("Refactored result total width:", sum(width(refactored_result)))) - print(paste("Original result unique gene_ids:", length(unique(mcols(original_result)$gene_id)))) - print(paste("Refactored result unique gene_ids:", length(unique(mcols(refactored_result)$gene_id)))) - - # Compare the first few entries - print("First 5 entries of original result:") - print(head(original_result, 5)) - print("First 5 entries of refactored result:") - print(head(refactored_result, 5)) -}) diff --git a/bin/parameter_tuning.R b/bin/grohmm_parametertuning.R similarity index 96% rename from bin/parameter_tuning.R rename to bin/grohmm_parametertuning.R index de323bf5..482b8741 100755 --- a/bin/parameter_tuning.R +++ b/bin/grohmm_parametertuning.R @@ -119,8 
+119,8 @@ tune <- data.frame( LtProbB = args$ltprobb, UTS = args$uts ) -Fp <- windowAnalysis(alignments, strand = "+", windowSize = 50) -Fm <- windowAnalysis(alignments, strand = "-", windowSize = 50) +fp <- windowAnalysis(alignments, strand = "+", windowSize = 50) +fm <- windowAnalysis(alignments, strand = "-", windowSize = 50) hmm <- detectTranscripts( Fp = Fp, Fm = Fm, diff --git a/bin/transcriptcalling_grohmm.R b/bin/grohmm_transcriptcalling.R similarity index 88% rename from bin/transcriptcalling_grohmm.R rename to bin/grohmm_transcriptcalling.R index 3f362275..cdc4ca6b 100755 --- a/bin/transcriptcalling_grohmm.R +++ b/bin/grohmm_transcriptcalling.R @@ -145,7 +145,8 @@ write.table( print("Input transcript annotations") kg_db <- makeTxDbFromGFF(args$gxf) kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name")) -# TODO I wonder if I could speed things up by filtering by chromosome at the Nextflow level +# TODO I wonder if I could speed things up by filtering +# by chromosome at the Nextflow level... # https://github.com/google/deepvariant/issues/744 # filter=list(tx_chrom="chr7")) # exclude any transcripts that are located on chromosomes labeled with "random". 
@@ -170,8 +171,10 @@ get_expressed_annotations <- function(features, reads) { f_limit <- limitToXkb(features) count <- countOverlaps(f_limit, reads) features <- features[count != 0, ] - return(features[(quantile(width(features), .05) < width(features)) & - (width(features) < quantile(width(features), .95)), ]) + return(features[ + (quantile(width(features), .05) < width(features)) & + (width(features) < quantile(width(features), .95)), + ]) } con_expressed <- get_expressed_annotations( features = kg_consensus, @@ -181,15 +184,20 @@ b_plus <- breakTranscriptsOnGenes(tx_hmm, kg_consensus, strand = "+") b_minus <- breakTranscriptsOnGenes(tx_hmm, kg_consensus, strand = "-") tx_broken <- c(b_plus, b_minus) # Assign unique IDs if they're missing -if (is.null(mcols(tx_broken)$transcript_id) || any(is.na(mcols(tx_broken)$transcript_id))) { - mcols(tx_broken)$transcript_id <- paste0("TX", seq_along(tx_broken)) +if ( + is.null(mcols(tx_broken)$transcript_id) || + any(is.na(mcols(tx_broken)$transcript_id)) +) { + mcols(tx_broken)$transcript_id <- paste0("TX", seq_along(tx_broken)) } # Filter out any transcripts with NA values in start or end positions -tx_broken_filtered <- tx_broken[!is.na(start(tx_broken)) & !is.na(end(tx_broken))] +tx_broken_filtered <- + tx_broken[!is.na(start(tx_broken)) & !is.na(end(tx_broken))] # Ensure that kg_consensus also doesn't contain NA values -kg_consensus_filtered <- kg_consensus[!is.na(start(kg_consensus)) & !is.na(end(kg_consensus))] +kg_consensus_filtered <- + kg_consensus[!is.na(start(kg_consensus)) & !is.na(end(kg_consensus))] # Now call combineTranscripts with the filtered data tx_final <- combineTranscripts(tx_broken_filtered, kg_consensus_filtered) @@ -212,7 +220,10 @@ capture.output(td_final, file = paste0(args$outprefix, ".tdFinal.txt")) # Write the data used in the plot to a CSV file data_to_write <- data.frame(x = td_final$x, profile = td_final$profile) -write.csv(data_to_write, file = paste0(args$outprefix, ".tdFinal_mqc.csv"), 
row.names = FALSE) +write.csv(data_to_write, + file = paste0(args$outprefix, ".tdFinal_mqc.csv"), + row.names = FALSE +) ######################## ## CITE PACKAGES USED ## diff --git a/modules/local/grohmm/parametertuning/main.nf b/modules/local/grohmm/parametertuning/main.nf index ad5ac5c7..0d2a5069 100644 --- a/modules/local/grohmm/parametertuning/main.nf +++ b/modules/local/grohmm/parametertuning/main.nf @@ -26,7 +26,7 @@ process GROHMM_PARAMETERTUNING { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}_${UTS}_${LtProbB}" """ - parameter_tuning.R \\ + grohmm_parametertuning.R \\ --bam_file ${bams} \\ --outprefix ${prefix} \\ --gxf $gxf \\ diff --git a/modules/local/grohmm/transcriptcalling/main.nf b/modules/local/grohmm/transcriptcalling/main.nf index 07cba0ca..984d1800 100644 --- a/modules/local/grohmm/transcriptcalling/main.nf +++ b/modules/local/grohmm/transcriptcalling/main.nf @@ -28,7 +28,7 @@ process GROHMM_TRANSCRIPTCALLING { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ - transcriptcalling_grohmm.R \\ + grohmm_transcriptcalling.R \\ --bam_file ${bams} \\ --tuning_file ${tuning_file} \\ --outprefix ${prefix} \\ diff --git a/subworkflows/local/grohmm/tests/main.nf.test b/subworkflows/local/grohmm/tests/main.nf.test index 29382e3f..1cbb283c 100644 --- a/subworkflows/local/grohmm/tests/main.nf.test +++ b/subworkflows/local/grohmm/tests/main.nf.test @@ -17,15 +17,15 @@ nextflow_workflow { workflow { """ input[0] = Channel.of([ - [ id: 'Sall' ], + [ id: 'Sall' ], [ file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S0mR1.bam", checkIfExists: true), file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S40mR1.bam", checkIfExists: true) ], [], - ]) + ]) input[1] = Channel.of([file( "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf", checkIfExists: true - )]).first() + 
)]).first() """ } } @@ -41,9 +41,10 @@ nextflow_workflow { // workflow.out.td_plot, ).match() }, - { assert snapshot( - path(workflow.out.versions.get(0)).yaml) - .match("versions") + { + assert snapshot( + path(workflow.out.versions.get(0)).yaml + ).match("versions") }, ) } From 9c651138468061bc881c9bb76e8b17d9e63bd61f Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Tue, 22 Oct 2024 12:15:33 -0500 Subject: [PATCH 54/54] chore: Move test data to test-datasets repo --- modules/local/grohmm/parametertuning/tests/chm13_gtf.nf.test | 2 +- modules/local/grohmm/parametertuning/tests/main.nf.test | 2 +- modules/local/grohmm/transcriptcalling/tests/main.nf.test | 4 ++-- subworkflows/local/grohmm/tests/main.nf.test | 2 +- .../transcript_indentification/grohmm/only_gff/main.nf.test | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/local/grohmm/parametertuning/tests/chm13_gtf.nf.test b/modules/local/grohmm/parametertuning/tests/chm13_gtf.nf.test index 0d27d847..3b30e5ce 100644 --- a/modules/local/grohmm/parametertuning/tests/chm13_gtf.nf.test +++ b/modules/local/grohmm/parametertuning/tests/chm13_gtf.nf.test @@ -19,7 +19,7 @@ nextflow_process { [], ] input[1] = file( - "https://gist.githubusercontent.com/edmundmiller/f9a31e300a90956d8aaff7ad6105e394/raw/99f6eff1ddb8ca9ac1cd766ea2fed9bb83919fb2/broke.gtf", + "https://raw.githubusercontent.com/nf-core/test-datasets/nascent/reference/broke.gtf", checkIfExists: true ) input[2] = 5 diff --git a/modules/local/grohmm/parametertuning/tests/main.nf.test b/modules/local/grohmm/parametertuning/tests/main.nf.test index eafdb9d9..09ae26fa 100644 --- a/modules/local/grohmm/parametertuning/tests/main.nf.test +++ b/modules/local/grohmm/parametertuning/tests/main.nf.test @@ -19,7 +19,7 @@ nextflow_process { [], ] input[1] = file( - "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf", + 
"https://raw.githubusercontent.com/nf-core/test-datasets/nascent/reference/hg19.chr7.refGene.gtf", checkIfExists: true ) input[2] = 5 diff --git a/modules/local/grohmm/transcriptcalling/tests/main.nf.test b/modules/local/grohmm/transcriptcalling/tests/main.nf.test index 01a12c62..06e8750f 100644 --- a/modules/local/grohmm/transcriptcalling/tests/main.nf.test +++ b/modules/local/grohmm/transcriptcalling/tests/main.nf.test @@ -20,7 +20,7 @@ nextflow_process { [], ] input[1] = file( - "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf", + "https://raw.githubusercontent.com/nf-core/test-datasets/nascent/reference/hg19.chr7.refGene.gtf", checkIfExists: true ) """ @@ -50,7 +50,7 @@ nextflow_process { file("${projectDir}/tests/config/tuningparams_small.csv", checkIfExists: true), ] input[1] = file( - "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf", + "https://raw.githubusercontent.com/nf-core/test-datasets/nascent/reference/hg19.chr7.refGene.gtf", checkIfExists: true ) """ diff --git a/subworkflows/local/grohmm/tests/main.nf.test b/subworkflows/local/grohmm/tests/main.nf.test index 1cbb283c..39f9f380 100644 --- a/subworkflows/local/grohmm/tests/main.nf.test +++ b/subworkflows/local/grohmm/tests/main.nf.test @@ -23,7 +23,7 @@ nextflow_workflow { [], ]) input[1] = Channel.of([file( - "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf", + "https://raw.githubusercontent.com/nf-core/test-datasets/nascent/reference/hg19.chr7.refGene.gtf", checkIfExists: true )]).first() """ diff --git a/workflows/tests/transcript_indentification/grohmm/only_gff/main.nf.test b/workflows/tests/transcript_indentification/grohmm/only_gff/main.nf.test index 4e8ab43d..362f87d5 100644 --- 
a/workflows/tests/transcript_indentification/grohmm/only_gff/main.nf.test +++ b/workflows/tests/transcript_indentification/grohmm/only_gff/main.nf.test @@ -14,7 +14,7 @@ nextflow_pipeline { outdir = "$outputDir" skip_grohmm = false gff = 'https://huggingface.co/datasets/edmundmiller/nascent-test-data/resolve/main/chm13v2.0_RefSeq_Liftoff_v5.1.chr21.gff3.gz' - // TODO Use CHM13 Fasta + fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nascent/reference/chm13v2.0.chr21.fa.gz' gtf = null bed = null }