Skip to content

Commit

Permalink
Merge pull request #128 from mskcc/enhancement/hg38_update
Browse files Browse the repository at this point in the history
Enhancement/hg38 update
  • Loading branch information
anoronh4 authored Jan 13, 2025
2 parents c094073 + fb2f514 commit d38564e
Show file tree
Hide file tree
Showing 61 changed files with 4,306 additions and 384 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- [#118](https://github.com/mskcc/forte/pull/118) - change the way the plug-n-play starfusion reference is downloaded.

- [#128](https://github.com/mskcc/forte/pull/128) - full support for GRCh38 added

### `Fixed`

- [#119](https://github.com/mskcc/forte/pull/119) - change script error behavior in METAFUSION_RUN process
Expand Down
19 changes: 9 additions & 10 deletions bin/final_generate_v75_gene_bed.R → bin/generate_gene_bed.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# __author__ = "Alexandria Dymun"
# __email__ = "pintoa1@mskcc.org"
# __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)"
# __version__ = "0.0.1"
# __version__ = "0.0.2"
# __status__ = "Dev"


Expand All @@ -12,11 +12,12 @@ suppressPackageStartupMessages({
library(dplyr)
library(data.table)
library(stringr)
options(scipen = 999)
})

usage <- function() {
message("Usage:")
message("final_generate_v75_gene_bed.R <in.gff> <out.bed>")
message("generate_gene_bed.R <in.gff> <out.bed>")
}

args = commandArgs(TRUE)
Expand All @@ -26,15 +27,10 @@ if (length(args)!=2) {
quit()
}

# Utilized gtf from igenomes for FORTE This corresponds to GRCh37 ensembl 75
# Add introns to gtf, convert to gff3
# bsub -R "rusage[mem=64]" -o add_introns_agat_%J.out singularity exec -B /juno/ \\
# -B /tmp -B /scratch/ docker://quay.io/biocontainers/agat:0.8.0--pl5262hdfd78af_0 \\
# /bin/bash -c "agat_sp_add_introns.pl -g /juno/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf\\
# -o genes.INTRONS.gff3"

gtf <- rtracklayer::import(args[1])
gtf_df <- as.data.frame(gtf)
#remove incomplete transcripts mRNA_end_NF and mRNA_start_NF (not finished)
gtf_df <- gtf_df[!grepl("NF",gtf_df$tag),]

file.to_write <- args[2]

Expand All @@ -44,7 +40,8 @@ gtf_df <- gtf_df %>%
chr = seqnames
) %>%
select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>%
filter(type %in% c("exon","intron","UTR","CDS","cds","utr")) %>% mutate(start = start-1)
filter(type %in% c("exon","intron","UTR","CDS","cds","utr","five_prime_utr","three_prime_utr")) %>%
mutate(gene_name = ifelse(is.na(gene_name),gene_id,gene_name)) %>% mutate(start = start-1)


#START CLOCK
Expand Down Expand Up @@ -106,6 +103,8 @@ modify_transcript <- function(transcript){
transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr5"
}
}
transcript$type[transcript$type == "five_prime_utr"] <- "utr5"
transcript$type[transcript$type == "three_prime_utr"] <- "utr3"
#### Any exon that remains after the CDS change is likely an untranslated region. Change below

# Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5)
Expand Down
2 changes: 1 addition & 1 deletion bin/make_gene_info_for_forte.R
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ gene_info <- rbind(gene_info,add_these_excess_gene_ids)
gene_info <- merge(gene_info,do.call(rbind,unique_id_to_names[versioned_gtf])[,c("gene_id","gene_id_with_version")],by = "gene_id",all.x = T, all.y = F)

gene_info$Synonyms <- ifelse(is.na(gene_info$gene_id_with_version),gene_info$gene_id,paste0(gene_info$gene_id,"|",gene_info$gene_id_with_version))
gene_info$Symbol <- gene_info$gene_name
gene_info$Symbol <- ifelse(is.na(gene_info$gene_name), gene_info$gene_id, gene_info$gene_name)

gene_info <- gene_info[,c("Symbol","Synonyms")]

Expand Down
21 changes: 15 additions & 6 deletions conf/igenomes.config
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,21 @@ params {
ensembl_version = 75
}
'GRCh38' {
fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa"
gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.gtf"
refflat = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes.gencode/refFlat.txt.gz"
starfusion_url = "https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.10/GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz"
cdna = "https://ftp.ensembl.org/pub/release-86/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz"
ensembl_version = 111
fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38Decoy/Sequence/WholeGenomeFasta/genome.fa"
//fasta = "https://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
gtf = "https://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.GRCh38.111.gtf.gz"
//forte will generate refflat from gtf
refflat = null
starfusion_url = "https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.10/GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz"
cdna = "https://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz"
metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh38/blocklist_breakpoints.hg38.bedpe.gz"
baits {
'idt_v2' {
targets = "/juno/work/ccs/cmopipeline/forte/GRCh38_probes/xgen-exome-hyb-panel-v2-targets-hg38.bed"
baits = "/juno/work/ccs/cmopipeline/forte/GRCh38_probes/xgen-exome-hyb-panel-v2-probes-hg38.bed"
}
}
}
'smallGRCh37' {
fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta"
Expand All @@ -48,7 +58,6 @@ params {
cdna = "http://ftp.ensemblgenomes.org/pub/viruses/fasta/sars_cov_2/cdna/Sars_cov_2.ASM985889v3.cdna.all.fa.gz"
metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/blocklist_breakpoints.bedpe"
ensembl_version = 75

}
/*
'hg38' {
Expand Down
11 changes: 10 additions & 1 deletion conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,14 @@ process {
]
}

withName: '.*:PREPARE_REFERENCES:GUNZIP.*' {
storeDir = { "${params.reference_base}/${params.genome}/${task.process.tokenize(':')[-1].toLowerCase()}" }
}

withName: 'FASTAREMOVEPREFIX' {
storeDir = { "${params.reference_base}/${params.genome}/fasta" }
}

withName: 'MSKCC_FORTE:FORTE:MULTIQC' {
publishDir = [
path: { "${report.folder}/report" },
Expand Down Expand Up @@ -210,6 +218,7 @@ process {
]
}
withName: 'AGAT_SPADDINTRONS' {
cpus = { 4 * task.attempt }
storeDir = { "${params.reference_base}/${params.genome}/metafusion/introns" }
publishDir = [
enabled: false,
Expand Down Expand Up @@ -475,7 +484,7 @@ process {
]
}

withName: ARRIBA {
withName: ARRIBA_ARRIBA {
ext.args = {
"-s ${meta.single_end || meta.strandedness == "forward" ? "yes" : meta.strandedness == "reverse" ? "reverse" : "no" }"
}
Expand Down
17 changes: 9 additions & 8 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@
"git_sha": "6898156da3604a6bdf26c36036053a970050fea0",
"installed_by": ["modules"]
},
"arriba": {
"arriba/arriba": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
"installed_by": ["modules"]
},
"cat/cat": {
Expand All @@ -48,12 +48,12 @@
},
"gatk4/bedtointervallist": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
"installed_by": ["modules"]
},
"gatk4/createsequencedictionary": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
"installed_by": ["modules"]
},
"gunzip": {
Expand Down Expand Up @@ -134,7 +134,7 @@
},
"samtools/faidx": {
"branch": "master",
"git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
"git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
"installed_by": ["modules"]
},
"samtools/index": {
Expand All @@ -149,13 +149,14 @@
},
"star/align": {
"branch": "master",
"git_sha": "57d75dbac06812c59798a48585032f6e50bb1914",
"git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
"installed_by": ["modules"]
},
"star/genomegenerate": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"installed_by": ["modules"]
"git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
"installed_by": ["modules"],
"patch": "modules/nf-core/star/genomegenerate/star-genomegenerate.diff"
},
"subread/featurecounts": {
"branch": "master",
Expand Down
4 changes: 2 additions & 2 deletions modules/local/agfusion/batch/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ process AGFUSION_BATCH {
// Note: 2.7X indices incompatible with AWS iGenomes.
conda 'bioconda::agfusion=1.252'
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'docker://cmopipeline/agfusion:0.0.6' :
'docker.io/cmopipeline/agfusion:0.0.6' }"
'docker://cmopipeline/agfusion:0.0.7' :
'docker.io/cmopipeline/agfusion:0.0.7' }"

input:
tuple val(meta), path(fusions)
Expand Down
27 changes: 21 additions & 6 deletions modules/local/agfusion/container/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,14 +1,30 @@
FROM ubuntu:bionic-20230530
FROM ubuntu:jammy-20240911.1

LABEL maintainer="Anne Marie Noronha (noronhaa@mskcc.org)" \
version.image="0.0.6"
version.image="0.0.7"

# INSTALL DEPENDENCIES

ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update -y
RUN apt-get install -y build-essential python3 python3-pip python3-matplotlib python3-pandas python3-future python3-biopython curl less vim libnss-sss git zip
RUN apt-get install -y \
build-essential \
python3 \
python3-pip \
python3-matplotlib \
python3-pandas \
python3-future \
python3-biopython \
python3-dev \
default-libmysqlclient-dev \
pkg-config \
curl \
less \
vim \
libnss-sss \
git \
zip
RUN pip3 install --upgrade pip
RUN pip3 install pyensembl

Expand All @@ -18,9 +34,8 @@ RUN pip3 install mysqlclient

# INSTALL AGFUSION & DATABASE FILES
WORKDIR /usr/local/bin
RUN git clone https://github.com/mskcc/AGFusion.git --branch v1.4.1-fork1 --single-branch
RUN git clone https://github.com/mskcc/AGFusion.git --branch v1.4.3@mskcc.1 --single-branch
WORKDIR /usr/local/bin/AGFusion
RUN pip3 install -r requirements.txt
RUN pip3 install .

# downgrade pyensembl for compatibility
RUN pip3 install gtfparse==1.2.1 --upgrade
10 changes: 5 additions & 5 deletions modules/local/agfusion/download/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ process AGFUSION_DOWNLOAD {
// Note: 2.7X indices incompatible with AWS iGenomes.
conda 'bioconda::agfusion=1.252'
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'docker://cmopipeline/agfusion:0.0.6' :
'docker.io/cmopipeline/agfusion:0.0.6' }"
'docker://cmopipeline/agfusion:0.0.7' :
'docker.io/cmopipeline/agfusion:0.0.7' }"

input:
val(ensembl_release)
Expand All @@ -25,13 +25,13 @@ process AGFUSION_DOWNLOAD {
['GRCh38','hg38'].contains(genome) ? 'hg38' :
['GRCm38','mm10'].contains(genome) ? 'mm10' : ''
def pyensembl_species = ['GRCm38','mm10'].contains(genome) ? 'mus_musculus' : 'homo_sapiens'
if (ensembl_release < 93) {
if (ensembl_release < 112) {
"""
export PYENSEMBL_CACHE_DIR=\$PWD/pyensembl_cache
pyensembl install --species ${pyensembl_species} --release ${ensembl_release}
agfusion download -g ${agfusion_genome}
agfusion download -s ${pyensembl_species} -r ${ensembl_release}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand All @@ -44,7 +44,7 @@ process AGFUSION_DOWNLOAD {
pyensembl install --species ${pyensembl_species} --release ${ensembl_release}
curl http://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/database_files/pfamA.txt.gz > pfamA.txt.gz
curl http://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam37.0/database_files/pfamA.txt.gz > pfamA.txt.gz
gunzip pfamA.txt.gz
agfusion build --dir . --species ${agfusion_genome} --release ${ensembl_release} --pfam pfamA.txt
rm pfamA.txt
Expand Down
5 changes: 5 additions & 0 deletions modules/local/fastaremoveprefix/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Conda environment for the FASTAREMOVEPREFIX module (modules/local/fastaremoveprefix).
channels:
- conda-forge
- bioconda
dependencies:
# Same gawk release (5.3.0) as the container images referenced by the module's main.nf.
- conda-forge::gawk=5.3.0
32 changes: 32 additions & 0 deletions modules/local/fastaremoveprefix/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Rewrite FASTA headers so that contig names lose a leading "chr" prefix and the
// mitochondrial contig "M" is renamed to "MT".
//
// Input : tuple(meta, fasta) - the FASTA is staged under input/ so the rewritten
//         copy can be written to the task directory under the same file name.
// Output: fasta    - tuple(meta, rewritten FASTA matching *.fa or *.fasta)
//         versions - versions.yml (reports the awk version of the environment)
process FASTAREMOVEPREFIX {
    tag "$fasta"
    label 'process_single'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/gawk:5.3.0' :
        'biocontainers/gawk:5.3.0' }"

    when:
    task.ext.when == null || task.ext.when

    input:
    tuple val(meta), path(fasta, name: 'input/*')

    output:
    tuple val(meta), path("*.{fa,fasta}"), emit: fasta
    path "versions.yml"                  , emit: versions

    script:
    // Keep the original file name; only the staging directory differs.
    def modified_fasta = fasta.fileName.name
    // sed expressions, applied in order to every line:
    //   1. strip a leading "chr" from the contig name
    //   2. rename a bare ">M" header (no description after the name) to ">MT"
    //   3. rename ">M" followed by whitespace to ">MT", keeping the separator
    // Expression 2 covers headers such as ">chrM" that carry no description;
    // matching only ">M " (with a trailing space) would leave them as ">M".
    """
    sed \\
        -e 's/^>chr/>/' \\
        -e 's/^>M\$/>MT/' \\
        -e 's/^>M\\([[:space:]]\\)/>MT\\1/' \\
        ${fasta} > ${modified_fasta}

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
    END_VERSIONS
    """
}
13 changes: 7 additions & 6 deletions modules/local/metafusion/genebed/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,9 @@ process METAFUSION_GENEBED {

input:
tuple val(meta), path(gff)
val ensembl_version

output:
tuple val(meta), path("*.metafusion.gene.bed"), emit: metafusion_gene_bed
tuple val(meta), path("${meta.id}.metafusion.gene.bed"), emit: metafusion_gene_bed
path "versions.yml" , emit: versions

when:
Expand All @@ -22,27 +21,29 @@ process METAFUSION_GENEBED {
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
"""
final_generate_v75_gene_bed.R \\
generate_gene_bed.R \\
$gff \\
${ensembl_version}.metafusion.gene.bed
${prefix}.metafusion.gene.bed
cat <<-END_VERSIONS > versions.yml
"${task.process}":
R: \$(R --version | head -n1)
final_generate_v75_gene_bed.R: 0.0.1
generate_gene_bed.R: 0.0.2
END_VERSIONS
"""

stub:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"

"""
touch ${prefix}.metafusion.gene.bed
cat <<-END_VERSIONS > versions.yml
"${task.process}":
R: \$(R --version | head -n1)
final_generate_v75_gene_bed.R: 0.0.1
generate_gene_bed.R: 0.0.2
END_VERSIONS
"""

}
Loading

0 comments on commit d38564e

Please sign in to comment.