- test/inputs/libraries.tsv
- test/inputs/cna_libraries.tsv
- Get fastq inputs for read processing and testing
#!/bin/echo For documentation, not intended to be executable:
mntpt=/mnt/ris/aadel/Active
singularity shell --bind ${mntpt} ~/sing_containers/biotools.1.0.2.sif

repo=/home/jeszyman/repos/cfdna-wgs
mntpt=/mnt/ris/aadel/Active
mkdir -p ${repo}/test/inputs
fq_size=100000

zcat ${mntpt}/mpnst/inputs/seq/MPNST/19_2_082_R1.fastq.gz | head -n $fq_size > ${repo}/test/inputs/mpnst1_R1.fastq
zcat ${mntpt}/mpnst/inputs/seq/MPNST/19_2_082_R2.fastq.gz | head -n $fq_size > ${repo}/test/inputs/mpnst1_R2.fastq
zcat ${mntpt}/mpnst/inputs/seq/MPNST/25_2_072_R1.fastq.gz | head -n $fq_size > ${repo}/test/inputs/mpnst2_R1.fastq
zcat ${mntpt}/mpnst/inputs/seq/MPNST/25_2_072_R2.fastq.gz | head -n $fq_size > ${repo}/test/inputs/mpnst2_R2.fastq
zcat ${mntpt}/mpnst/inputs/seq/PN/37_JS0050CD112717_R1.fastq.gz | head -n $fq_size > ${repo}/test/inputs/plex1_R1.fastq
zcat ${mntpt}/mpnst/inputs/seq/PN/37_JS0050CD112717_R2.fastq.gz | head -n $fq_size > ${repo}/test/inputs/plex1_R2.fastq
zcat ${mntpt}/mpnst/inputs/seq/PN/30_JS0044CD112818_R1.fastq.gz | head -n $fq_size > ${repo}/test/inputs/plex2_R1.fastq
zcat ${mntpt}/mpnst/inputs/seq/PN/30_JS0044CD112818_R2.fastq.gz | head -n $fq_size > ${repo}/test/inputs/plex2_R2.fastq

for file in ${repo}/test/inputs/*.fastq; do gzip -f $file; done
- Get bams for CNA and frag testing
mntpt=/mnt/ris/aadel/Active
singularity shell --bind ${mntpt} ~/sing_containers/biotools.1.0.2.sif

repo=/home/jeszyman/repos/cfdna-wgs
mntpt=/mnt/ris/aadel/Active
mkdir -p ${repo}/test/inputs

# Create small bam files to store in repo. Subsample real bams to ~100 Mb.
sambamba view -s .005 -f bam -t 36 /mnt/ris/aadel/Active/mpnst/test/bam/new_HiSeq15_L002001_ACAC_extract_ds20.bam |
    samtools sort -@4 -o test/inputs/lib003_hg38.bam -
sambamba view -s .005 -f bam -t 36 /mnt/ris/aadel/Active/mpnst/test/bam/new_HiSeq15_L002001_ATCG_extract_ds20.bam > test/inputs/lib004_hg38.bam
sambamba view -s 0.01 -f bam -t 4 /mnt/ris/aadel/Active/mpnst/bam/cfdna_wgs/ds/lib105_ds10.bam > test/inputs/lib005.bam
sambamba view -s 0.01 -f bam -t 4 /mnt/ris/aadel/Active/mpnst/bam/cfdna_wgs/ds/lib205_ds10.bam > test/inputs/lib006.bam

for file in test/inputs/*.bam; do samtools index $file; done
FROM jeszyman/biotools:1.0.2
#################
### Notes ###
#################
#
# After build, the image will be pushed to the dockerhub as
# jeszyman/cfdna_wgs
# (https://hub.docker.com/repository/docker/jeszyman/cfdna_wgs)
#
RUN cd /opt && \
git clone https://github.com/shahcompbio/hmmcopy_utils.git && \
cd hmmcopy_utils && \
cmake . && \
make
#
# ichorCNA
##
## linux dependencies
RUN apt-get update \
&& apt-get install -y \
libcurl4-openssl-dev \
libssl-dev \
libxml2-dev
#RUN rm /usr/lib/x86_64-linux-gnu/libcurl.so.4
#RUN ln -s /usr/lib/x86_64-linux-gnu/libcurl.so.4.5.0 /usr/lib/x86_64-linux-gnu/libcurl.so.4
##
## R dependencies
RUN R -e 'install.packages("BiocManager"); BiocManager::install(); BiocManager::install("HMMcopy"); BiocManager::install("GenomeInfoDb"); BiocManager::install("GenomicRanges");'
##
## git clone install
RUN cd /opt \
&& git clone https://github.com/broadinstitute/ichorCNA.git \
&& cd ichorCNA \
&& R CMD INSTALL . \
&& cd /opt
##
### Parameters intended to be common across workflows ###
blklist: "test/inputs/hg38-blacklist.v2.bed"
datadir: "test"
default_container: "/home/jeszyman/sing_containers/biotools.1.0.2.sif"
genome_fasta: "test/inputs/chr8.fa"
genome_ref: "test/ref/chr8.fa"
qcdir: "test/analysis/qc"
threads: 4
### Unique properties from this repo ###
cfdna_wgs_container: "/home/jeszyman/sing_containers/cfdna_wgs.1.0.0.sif"
cfdna_wgs_repo: "/home/jeszyman/repos/cfdna-wgs"
downsample:
- "0.001"
- "0.004"
frag_distro: "90_150"
gc5mb: "test/inputs/gc5mb.bed"
picard_jar: "/opt/picard/picard.jar"
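For orientation, here is a minimal sketch of the config dictionary the YAML above produces. Snakemake parses --configfile itself; PyYAML is used here only to illustrate the resulting structure, and the config/int_test.yaml path follows the repo layout described below.

import yaml

# Assumes the YAML above is saved as config/int_test.yaml.
with open("config/int_test.yaml") as f:
    config = yaml.safe_load(f)

datadir = config["datadir"]                        # "test"
cfdna_wgs_bams = datadir + "/analysis/cfdna_wgs/bams"
assert config["downsample"] == ["0.001", "0.004"]  # quoted, so strings not floats
assert config["threads"] == 4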
INPROCESS Integration testing snakefile wrapper
##################################################################
### Integration testing snakefile for WGS cfDNA Processing ###
##################################################################
import os
import pandas as pd
import re
import numpy as np
# Values directly from configuration file
DOWNSAMPLE = config["downsample"]
threads = config["threads"]
FRAG_DISTROS = config["frag_distro"]
cfdna_wgs_threads = config["threads"]
default_container = config["default_container"]
cfdna_wgs_container = config["cfdna_wgs_container"]
genome_fasta = config["genome_fasta"]
genome_ref = config["genome_ref"]
cfdna_wgs_repo = config["cfdna_wgs_repo"]
# Directory values derived from datadir in configuration YAML
datadir = config["datadir"]
cfdna_wgs = datadir + "/analysis/cfdna_wgs"
cfdna_wgs_bams = datadir + "/analysis/cfdna_wgs/bams"
cfdna_wgs_fastqs = datadir + "/analysis/cfdna_wgs/fastqs"
cfdna_wgs_frag = datadir + "/analysis/cfdna_wgs/frag"
cfdna_wgs_frag_beds = datadir + "/analysis/cfdna_wgs/frag/beds"
cfdna_wgs_frag_counts = datadir + "/analysis/cfdna_wgs/frag/counts"
cfdna_wgs_frag_gc_distros = datadir + "/analysis/cfdna_wgs/frag/distros"
qcdir = datadir + "/analysis/qc"
benchdir = datadir + "/benchmark"
logdir = datadir + "/logs"
refdir = datadir + "/ref"
cfdna_wgs_scriptdir = config["cfdna_wgs_repo"] + "/scripts"
### Functions ###
# Set up the sample name index as a python dictionary
cfdna_wgs_libraries = pd.read_table(config["datadir"] + "/inputs/libraries.tsv")
readable = []
for x in cfdna_wgs_libraries.file:
readable.append(os.access(x, os.R_OK))
# Ensure readable fastqs
cfdna_wgs_libraries['readable']=readable
cfdna_wgs_libraries = cfdna_wgs_libraries[cfdna_wgs_libraries.readable == True]
# Ensure correct library type per sample sheet
cfdna_wgs_libraries = cfdna_wgs_libraries[cfdna_wgs_libraries.library_type == "wgs"]
cfdna_wgs_libraries = cfdna_wgs_libraries[cfdna_wgs_libraries.isolation_type == "cfdna"]
# Make the dictionary
cfdna_wgs_library_indict = cfdna_wgs_libraries["library"].tolist()
cfdna_wgs_file_indict = cfdna_wgs_libraries["file"].tolist()
cfdna_wgs_lib_dict = dict(zip(cfdna_wgs_library_indict, cfdna_wgs_file_indict))
CFDNA_WGS_LIBRARIES = list(cfdna_wgs_lib_dict.keys())
CFDNA_WGS_FASTQS = list(cfdna_wgs_lib_dict.values())
# Make a list of healthy libraries
CFDNA_WGS_HEALTHY_LIBRARIES = cfdna_wgs_libraries[cfdna_wgs_libraries['cohort'] == 'healthy']['library'].tolist()
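To make the dictionary construction above concrete, a toy example (the library names, paths, and cohorts below are hypothetical, not the repository's actual test sheet):

import pandas as pd

libraries = pd.DataFrame({
    "library":        ["lib001", "lib002", "lib003"],
    "file":           ["test/inputs/a_R1.fastq.gz",
                       "test/inputs/b_R1.fastq.gz",
                       "test/inputs/c_R1.fastq.gz"],
    "library_type":   ["wgs", "wgs", "panel"],
    "isolation_type": ["cfdna", "cfdna", "cfdna"],
    "cohort":         ["healthy", "cancer", "healthy"],
})

# Same filters as above: keep WGS cfDNA libraries only.
keep = libraries[(libraries.library_type == "wgs") &
                 (libraries.isolation_type == "cfdna")]
lib_dict = dict(zip(keep["library"], keep["file"]))
# {'lib001': 'test/inputs/a_R1.fastq.gz', 'lib002': 'test/inputs/b_R1.fastq.gz'}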
rule all:
input:
logdir + "/aggregate_output",
cfdna_wgs_frag + "/ratios.tsv",
qcdir + "/cfdna_wgs_read_qc.tsv",
qcdir + "/cfdna_wgs_frag_len.tsv",
onsuccess:
shell("""
bash {cfdna_wgs_scriptdir}/agg_bench.sh {benchdir} {qcdir}/agg_bench.tsv
""")
# For unit testing
indir="test/benchmark"
output="test/analysis/qc/bench_agg.tsv"
if [ -f $output ]; then rm $output; fi
for file in $indir/*
do
base=$(basename $file)
cat $file | awk -v OFS='\t' -v var=$base 'NR>1 {print var,$0}' >> $output
done
sed -i '1i\process\tfloat_sec\trun_time\tmax_rss\tmax_vms\tmax_uss\tmax_pss\tio_in\tio_out\tmean_load\tcpu_time' $output
library(tidyverse)
bmk_in = read_tsv("~/repos/cfdna-wgs/test/analysis/qc/bench_agg.tsv")
bmk =
bmk_in %>%
mutate(process = gsub(".benchmark.txt", "", process)) %>%
mutate(library = process) %>%
mutate(library = ifelse(grepl("lib[0-9]{3}_", process),
sub("^.*lib(\\d{3}).*$", "lib\\1", process), "all_libs")) %>%
mutate(process2 = process) %>%
mutate(process = gsub("^lib..._","", process)) %>%
rename(process_lib = process2)
find_outlier <- function(x) {
return(x > quantile(x, .75) + 1.5*IQR(x))
}
bmk %>% mutate(outlier = ifelse(find_outlier(run_time), process_lib, NA)) %>%
ggplot(.,aes(y=run_time)) +
geom_boxplot() +
geom_text(aes( y = run_time, x = .1,label=outlier), na.rm=TRUE, position = position_jitter())
rule symlink_inputs:
container: default_container,
input:
lambda wildcards: cfdna_wgs_lib_dict[wildcards.library],
output:
read1 = cfdna_wgs_fastqs + "/{library}_raw_R1.fastq.gz",
read2 = cfdna_wgs_fastqs + "/{library}_raw_R2.fastq.gz",
params:
outdir = cfdna_wgs_fastqs,
script = cfdna_wgs_scriptdir + "/symlink.sh",
shell:
"""
{params.script} \
{input} \
{output.read1} \
{output.read2} \
{params.outdir}
"""
#!/usr/bin/env bash
set -o errexit # abort on nonzero exitstatus
set -o nounset # abort on unbound variable
set -o pipefail # don't hide errors within pipes
# Script variables
input_read1="${1}"
output_read1="${2}"
output_read2="${3}"
outdir="${4}"
mkdir -p $outdir
input_read2="$(echo $input_read1 | sed "s/_R1/_R2/g")"
ln -sf --relative ${input_read1} ${output_read1}
ln -sf --relative ${input_read2} ${output_read2}
include: cfdna_wgs_repo + "/workflow/reads.smk"
include: cfdna_wgs_repo + "/workflow/frag.smk"
#########1#########2#########3#########4#########5#########6#########7#########8
# #
# Basic Read and Alignment Processing of #
# Cell-free DNA Whole Genome Sequencing #
# #
#########1#########2#########3#########4#########5#########6#########7#########8
- Snakemake
# Make alignment index
# Note: Upon first run, this rule will touch an empty file with the same path
# as the index prefix. Thereafter, you can avoid repeat indexing when the
# rule "sees" this empty file. For repo integration testing with an
# external reference, indexing can likewise be avoided with this empty
# file at the external index location.

rule cfdna_wgs_index:
    benchmark: benchdir + "/cfdna_wgs_index.benchmark.txt",
    container: cfdna_wgs_container,
    input: genome_fasta,
    log: logdir + "/cfdna_wgs_index.log",
    output:
        done = touch(genome_ref),
    params:
        out_prefix = genome_ref,
        script = cfdna_wgs_scriptdir + "/index.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        bwa index -p {params.out_prefix} {input} &> {log}
        """
- Snakemake
# Adapter-trim and QC reads with fastp

rule cfdna_wgs_fastp:
    benchmark: benchdir + "/{library}_cfdna_wgs_fastp.benchmark.txt",
    container: cfdna_wgs_container,
    input:
        read1 = cfdna_wgs_fastqs + "/{library}_raw_R1.fastq.gz",
        read2 = cfdna_wgs_fastqs + "/{library}_raw_R2.fastq.gz",
    log:
        cmd = logdir + "/{library}_cfdna_wgs_fastp.log",
        html = logdir + "/{library}_cfdna_wgs_fastp.html",
        json = logdir + "/{library}_cfdna_wgs_fastp.json",
    output:
        read1 = cfdna_wgs_fastqs + "/{library}_processed_R1.fastq.gz",
        read2 = cfdna_wgs_fastqs + "/{library}_processed_R2.fastq.gz",
        failed = cfdna_wgs_fastqs + "/{library}_failed_fastp.fastq.gz",
        unpaired1 = cfdna_wgs_fastqs + "/{library}_unpaired_R1.fastq.gz",
        unpaired2 = cfdna_wgs_fastqs + "/{library}_unpaired_R2.fastq.gz",
    params:
        script = cfdna_wgs_scriptdir + "/fastp.sh",
        threads = cfdna_wgs_threads,
    resources:
        mem_mb = 500,
    shell:
        """
        {params.script} \
        {input.read1} \
        {input.read2} \
        {log.html} \
        {log.json} \
        {output.read1} \
        {output.read2} \
        {output.failed} \
        {output.unpaired1} \
        {output.unpaired2} \
        {params.threads} &> {log.cmd}
        """
- Shell script
#!/usr/bin/env bash
set -o errexit   # abort on nonzero exitstatus
set -o nounset   # abort on unbound variable
set -o pipefail  # don't hide errors within pipes

# Script variables
input_read1="${1}"
input_read2="${2}"
log_html="${3}"
log_json="${4}"
output_read1="${5}"
output_read2="${6}"
output_failed="${7}"
output_unpaired1="${8}"
output_unpaired2="${9}"
params_threads="${10}"

# Functions
main(){
    fastp_wrap $output_failed \
               $input_read1 \
               $input_read2 \
               $log_html \
               $log_json \
               $output_read1 \
               $output_read2 \
               $output_unpaired1 \
               $output_unpaired2 \
               $params_threads
}

fastp_wrap(){
    fastp --detect_adapter_for_pe \
          --failed_out $output_failed \
          --in1 $input_read1 \
          --in2 $input_read2 \
          --html $log_html \
          --json $log_json \
          --out1 $output_read1 \
          --out2 $output_read2 \
          --unpaired1 $output_unpaired1 \
          --unpaired2 $output_unpaired2 \
          --thread $params_threads
}

# Run
main "$@"
- Snakemake
# Align reads with BWA

rule cfdna_wgs_align:
    benchmark: benchdir + "/{library}_cfdna_wgs_align.benchmark.txt",
    container: cfdna_wgs_container,
    input:
        ref = genome_ref,
        read1 = cfdna_wgs_fastqs + "/{library}_processed_R1.fastq.gz",
        read2 = cfdna_wgs_fastqs + "/{library}_processed_R2.fastq.gz",
    log: logdir + "/{library}_cfdna_wgs_align.log",
    output:
        sort = cfdna_wgs_bams + "/{library}_raw.bam",
        index = cfdna_wgs_bams + "/{library}_raw.bam.bai",
    params:
        script = cfdna_wgs_scriptdir + "/align.sh",
        threads = 4,
    resources:
        mem_mb = 500,
    shell:
        """
        {params.script} \
        {input.ref} \
        {input.read1} \
        {input.read2} \
        {params.threads} \
        {output.sort} &> {log}
        """
- Shell script
#!/usr/bin/env bash
set -o errexit   # abort on nonzero exitstatus
set -o nounset   # abort on unbound variable
set -o pipefail  # don't hide errors within pipes

input_ref=$1
input_r1=$2
input_r2=$3
threads=$4
output_sort=$5

bwa mem -M -t $threads \
    $input_ref \
    $input_r1 \
    $input_r2 |
    samtools view -@ $threads -b - |
    samtools sort -@ $threads -o $output_sort -

samtools index -@ $threads $output_sort
- Snakemake
# Remove PCR duplicates from aligned reads

rule cfdna_wgs_dedup:
    benchmark: benchdir + "/{library}_cfdna_wgs_dedup.benchmark.txt",
    container: cfdna_wgs_container,
    input: cfdna_wgs_bams + "/{library}_raw.bam",
    log: logdir + "/{library}_cfdna_wgs_dedup.log",
    output: cfdna_wgs_bams + "/{library}_dedup.bam",
    params:
        script = cfdna_wgs_scriptdir + "/dedup.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {input} \
        {output} \
        {params.threads} &> {log}
        """
- Shell script
#!/usr/bin/env bash
set -o errexit   # abort on nonzero exitstatus
set -o nounset   # abort on unbound variable
set -o pipefail  # don't hide errors within pipes

# Script variables
raw_bam="${1}"
dedup_bam="${2}"
threads="${3}"

samtools sort -@ $threads -n -o - $raw_bam |
    samtools fixmate -m - - |
    samtools sort -@ $threads -o - - |
    samtools markdup -@ $threads -r - $dedup_bam

samtools index $dedup_bam
- Snakemake
# Filter de-duplicated alignments.
# Removes unmapped, non-primary, and duplicate reads. Additional location
# filtering can be applied via the config bedfile variable.

checkpoint cfdna_wgs_filter_alignment:
    benchmark: benchdir + "/{library}_cfdna_wgs_filter_alignment.benchmark.txt",
    container: cfdna_wgs_container,
    input: cfdna_wgs_bams + "/{library}_dedup.bam",
    log: logdir + "/{library}_cfdna_wgs_filter_alignment.log",
    output: cfdna_wgs_bams + "/{library}_filt.bam",
    params:
        script = cfdna_wgs_scriptdir + "/filter_alignment.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {input} \
        {params.threads} \
        {output} &> {log}
        """
- Shell script
#!/usr/bin/env bash
input=$1
threads=$2
output=$3

# Filter to reads that are:
# - Not unmapped, not secondary (non-primary), and not duplicate
#   (-F 1284 = 4 + 256 + 1024)
# - MAPQ >= 20 only
# DO NOT restrict to "proper pairs" - this clips long cfDNA fragments!
samtools view -@ $threads -b -F 1284 -h -q 20 -o $output $input
samtools index ${output}
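The -F 1284 value above is just the bitwise union of the three SAM flag bits being excluded; a quick check in Python:

# SAM flag bits excluded by samtools view -F 1284
UNMAPPED  = 0x4    # read unmapped
SECONDARY = 0x100  # not primary alignment
DUPLICATE = 0x400  # PCR or optical duplicate

assert UNMAPPED | SECONDARY | DUPLICATE == 1284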
- Snakemake
# Get read quality by FASTQC

rule cfdna_wgs_fastqc:
    benchmark: benchdir + "/{library}_{processing}_{read}_cfdna_wgs_fastqc.benchmark.txt",
    container: cfdna_wgs_container,
    input: cfdna_wgs_fastqs + "/{library}_{processing}_{read}.fastq.gz",
    log: logdir + "/{library}_{processing}_{read}_cfdna_wgs_fastqc.log",
    output:
        qcdir + "/{library}_{processing}_{read}_fastqc.html",
        qcdir + "/{library}_{processing}_{read}_fastqc.zip",
    params:
        outdir = qcdir,
        script = cfdna_wgs_scriptdir + "/fastqc.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {input} \
        {params.outdir} \
        {params.threads} &> {log}
        """
- Shell script
#!/usr/bin/env bash
set -o errexit   # abort on nonzero exitstatus
set -o nounset   # abort on unbound variable
set -o pipefail  # don't hide errors within pipes

# Script variables
input="${1}"
outdir="${2}"
threads="${3}"

# Functions
fastqc --outdir $outdir \
       --quiet \
       --threads $threads $input
- Snakemake
# Get alignment QC using samtools

rule cfdna_wgs_alignment_qc:
    container: cfdna_wgs_container,
    input: cfdna_wgs_bams + "/{library}_{processing}.bam",
    log:
        flagstat = logdir + "/{library}_{processing}_flagstat_cfdna_wgs_alignment_qc.log",
        samstat = logdir + "/{library}_{processing}_samstats_cfdna_wgs_alignment_qc.log",
    output:
        flagstat = qcdir + "/{library}_{processing}_flagstat.txt",
        samstat = qcdir + "/{library}_{processing}_samstats.txt",
    params:
        script = cfdna_wgs_scriptdir + "/alignment_qc.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {input} \
        {log.flagstat} \
        {log.samstat} \
        {output.flagstat} \
        {output.samstat} \
        {params.threads}
        """
- Shell script
#!/usr/bin/env bash
set -o errexit   # abort on nonzero exitstatus
set -o nounset   # abort on unbound variable
set -o pipefail  # don't hide errors within pipes

# Script variables
input="${1}"
log_flagstat="${2}"
log_samstat="${3}"
output_flagstat="${4}"
output_samstat="${5}"
threads="${6}"

# Functions
main(){
    flagstat $input $output_flagstat $log_flagstat $threads
    samstats $input $output_samstat $log_samstat $threads
}

flagstat(){
    local input="${1}"
    local output="${2}"
    local log="${3}"
    local threads="${4}"
    #
    samtools flagstat -@ $threads $input > $output 2>$log
}

samstats(){
    local input="${1}"
    local output="${2}"
    local log="${3}"
    local threads="${4}"
    #
    samtools stats -@ $threads $input > $output 2>$log
}

# Run
main "$@"
- Snakemake
# Sequencing depth metrics via Picard

rule cfdna_wgs_picard_depth:
    benchmark: benchdir + "/{library}_cfdna_wgs_picard_depth.benchmark.txt",
    container: cfdna_wgs_container,
    input: cfdna_wgs_bams + "/{library}_filt.bam",
    log: logdir + "/{library}_cfdna_wgs_picard_depth.log",
    output: qcdir + "/{library}_picard_depth.txt",
    params:
        script = cfdna_wgs_scriptdir + "/picard_depth.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {input} \
        {config[picard_jar]} \
        {config[genome_fasta]} \
        {output}
        """
- Shell script
#!/usr/bin/env bash
input=$1
picard_jar=$2
genome_fasta=$3
output=$4

java -jar $picard_jar CollectWgsMetrics \
     INPUT=$input \
     OUTPUT=$output \
     READ_LENGTH=150 \
     REFERENCE_SEQUENCE=$genome_fasta
- Snakemake
# Get fragment sizes using deepTools

rule cfdna_wgs_bampefragsize:
    benchmark: benchdir + "/cfdna_wgs_bampefragsize.benchmark.txt",
    container: cfdna_wgs_container,
    input: expand(cfdna_wgs_bams + "/{library}_filt.bam", library = CFDNA_WGS_LIBRARIES),
    log: logdir + "/cfdna_wgs_bampefragsize.log",
    output:
        raw = qcdir + "/deeptools_frag_lengths.txt",
        hist = qcdir + "/deeptools_frag_lengths.png",
    params:
        blacklist = config["blklist"],
        script = cfdna_wgs_scriptdir + "/bampefragsize.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        "{input}" \
        {log} \
        {output.hist} \
        {output.raw} \
        {params.blacklist} \
        {params.threads}
        """
- Shell script
#!/usr/bin/env bash
set -o errexit   # abort on nonzero exitstatus
set -o nounset   # abort on unbound variable
set -o pipefail  # don't hide errors within pipes

# Script variables
input="${1}"
log="${2}"
output_hist="${3}"
output_raw="${4}"
blacklist="${5}"
threads="${6}"

bamPEFragmentSize --bamfiles $input \
                  --numberOfProcessors $threads \
                  --blackListFileName $blacklist \
                  --histogram $output_hist \
                  --maxFragmentLength 1000 \
                  --outRawFragmentLengths $output_raw
- Snakemake
# Make deeptools bamCoverage bedfile

rule cfdna_wgs_bamcoverage:
    benchmark: benchdir + "/{library}_cfdna_wgs_bamcoverage.benchmark.txt",
    container: cfdna_wgs_container,
    input: cfdna_wgs_bams + "/{library}_filt.bam",
    log: logdir + "/{library}_cfdna_wgs_bamcoverage.log",
    output: qcdir + "/{library}_bamcoverage.bg",
    params:
        bin = "10000",
        blacklist = config["blklist"],
        script = cfdna_wgs_scriptdir + "/bamcoverage.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {input} \
        {output} \
        {params.bin} \
        {params.blacklist} \
        {params.threads} &> {log}
        """
- Shell script
#!/usr/bin/env bash
in_bam=$1
out_bg=$2
bin=$3
blacklist=$4
threads=$5

bamCoverage \
    --bam $in_bam \
    --binSize $bin \
    --blackListFileName $blacklist \
    --effectiveGenomeSize 2913022398 \
    --extendReads \
    --ignoreDuplicates \
    --ignoreForNormalization chrX \
    --normalizeUsing RPGC \
    --numberOfProcessors $threads \
    --outFileFormat bedgraph \
    --outFileName $out_bg
- Snakemake
# Make deepTools plotCoverage coverage maps for all filtered bams

rule cfdna_wgs_plotcoverage:
    benchmark: benchdir + "/cfdna_wgs_plotcoverage.benchmark.txt",
    container: cfdna_wgs_container,
    input: expand(cfdna_wgs_bams + "/{library}_filt.bam", library = CFDNA_WGS_LIBRARIES),
    log: logdir + "/cfdna_wgs_plotcoverage.log",
    output:
        raw = qcdir + "/cfdna_wgs_coverage.tsv",
        plot = qcdir + "/cfdna_wgs_coverage.pdf",
    params:
        blacklist = config["blklist"],
        script = cfdna_wgs_scriptdir + "/plotcoverage.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        "{input}" \
        {params.blacklist} \
        {params.threads} \
        {output.raw} \
        {output.plot} &> {log}
        """
- Shell script
#!/usr/bin/env bash
in_bam_string=$1
blacklist=$2
threads=$3
out_raw=$4
out_plot=$5

plotCoverage \
    --bamfiles $in_bam_string \
    --blackListFileName $blacklist \
    --extendReads \
    --numberOfProcessors $threads \
    --outRawCounts $out_raw \
    --plotFile $out_plot \
    --plotFileFormat pdf \
    --skipZeros
- Snakemake
# Aggregate QC files using MultiQC

rule cfdna_wgs_multiqc:
    benchmark: benchdir + "/cfdna_wgs_multiqc.benchmark.txt",
    container: cfdna_wgs_container,
    input:
        expand(logdir + "/{library}_cfdna_wgs_fastp.json", library = CFDNA_WGS_LIBRARIES),
        expand(qcdir + "/{library}_{processing}_{read}_fastqc.zip",
               library = CFDNA_WGS_LIBRARIES,
               processing = ["raw", "processed", "unpaired"],
               read = ["R1", "R2"]),
        expand(qcdir + "/{library}_{processing}_samstats.txt",
               library = CFDNA_WGS_LIBRARIES,
               processing = ["raw", "filt"]),
        expand(qcdir + "/{library}_{processing}_flagstat.txt",
               library = CFDNA_WGS_LIBRARIES,
               processing = ["raw", "filt"]),
        expand(qcdir + "/{library}_picard_depth.txt", library = CFDNA_WGS_LIBRARIES),
        qcdir + "/deeptools_frag_lengths.txt",
        qcdir + "/cfdna_wgs_coverage.tsv",
    log: logdir + "/cfdna_wgs_multiqc.log",
    output:
        qcdir + "/cfdna_wgs_multiqc.html",
        qcdir + "/cfdna_wgs_multiqc_data/multiqc_fastqc.txt",
        qcdir + "/cfdna_wgs_multiqc_data/multiqc_samtools_stats.txt",
        qcdir + "/cfdna_wgs_multiqc_data/multiqc_picard_wgsmetrics.txt",
        qcdir + "/cfdna_wgs_multiqc_data/multiqc_samtools_flagstat.txt",
    params:
        out_dir = qcdir,
        out_name = "cfdna_wgs_multiqc",
        script = cfdna_wgs_scriptdir + "/multiqc.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        "{input}" \
        {params.out_name} \
        {params.out_dir} &> {log}
        """
- Shell script
#!/usr/bin/env bash
set -o errexit   # abort on nonzero exitstatus
set -o nounset   # abort on unbound variable
set -o pipefail  # don't hide errors within pipes

# Script variables
input="${1}"
out_name="${2}"
out_dir="${3}"

# Functions
multiqc $input \
        --force \
        --outdir $out_dir \
        --filename $out_name
- Snakemake
# Make a tab-separated aggregate QC table

checkpoint cfdna_wgs_make_qc_tsv:
    benchmark: benchdir + "/cfdna_wgs_make_qc_tsv.benchmark.txt",
    container: cfdna_wgs_container,
    input:
        fq = qcdir + "/cfdna_wgs_multiqc_data/multiqc_fastqc.txt",
        mqsam = qcdir + "/cfdna_wgs_multiqc_data/multiqc_samtools_stats.txt",
        mqflag = qcdir + "/cfdna_wgs_multiqc_data/multiqc_samtools_flagstat.txt",
        picard = qcdir + "/cfdna_wgs_multiqc_data/multiqc_picard_wgsmetrics.txt",
        deeptools_frag = qcdir + "/deeptools_frag_lengths.txt",
        deeptools_cov = qcdir + "/cfdna_wgs_coverage.tsv",
    log: logdir + "/cfdna_wgs_make_qc_tsv.log",
    output:
        readqc = qcdir + "/cfdna_wgs_read_qc.tsv",
        fraglen = qcdir + "/cfdna_wgs_frag_len.tsv",
    params:
        script = cfdna_wgs_scriptdir + "/make_qc_tsv.R",
    shell:
        """
        Rscript {params.script} \
        {input.fq} \
        {input.mqsam} \
        {input.mqflag} \
        {input.picard} \
        {input.deeptools_frag} \
        {input.deeptools_cov} \
        {output.readqc} \
        {output.fraglen} >& {log}
        """
- Rscript
#!/usr/bin/env Rscript
#
# Unit test variables
## mqc_dir="test/analysis/qc/cfdna_wgs_multiqc_data"
## fastqc_input = paste0(mqc_dir,"/multiqc_fastqc.txt")
## samstats_input = paste0(mqc_dir, "/multiqc_samtools_stats.txt")
## flagstats_input = paste0(mqc_dir, "/multiqc_samtools_flagstat.txt")
## picard_input = paste0(mqc_dir, "/multiqc_picard_wgsmetrics.txt")
## deeptools_frag_input = "test/analysis/qc/deeptools_frag_lengths.txt"
## deeptools_cov_input = "test/analysis/qc/cfdna_wgs_coverage.tsv"

args = commandArgs(trailingOnly = TRUE)
fastqc_input = args[1]
samstats_input = args[2]
flagstats_input = args[3]
picard_input = args[4]
deeptools_frag_input = args[5]
deeptools_cov_input = args[6]
readqc_out_tbl = args[7]
frag_len_out_tbl = args[8]

library(tidyverse)

process_multiqc_fastqc = function(multiqc_fastqc_input){
    as_tibble(read.table(multiqc_fastqc_input, header = TRUE, sep = '\t', stringsAsFactors = FALSE)) %>%
        mutate(library = substr(Filename, 1, 6)) %>%
        mutate(read = ifelse(grepl("R1", Filename), "read1", "read2")) %>%
        mutate(fastq_processing = gsub("_.*$", "", substr(Sample, 8, nchar(Sample)))) %>%
        select(!c(Sample, File.type, Encoding)) %>%
        pivot_wider(
            names_from = c(read, fastq_processing),
            values_from = !c(library, read, fastq_processing))
}

fastqc = process_multiqc_fastqc(fastqc_input)

# (Exploratory block; result is not assigned)
as_tibble(read.table(fastqc_input, header = TRUE, sep = '\t', stringsAsFactors = FALSE)) %>%
    mutate(library = substr(Sample, 1, 6)) %>%
    mutate(bam_processing = gsub("_.*$", "", substr(Sample, 8, nchar(Sample)))) %>%
    select(!c(Sample)) %>%
    pivot_wider(
        names_from = c(bam_processing),
        values_from = !c(library, bam_processing))

process_multiqc_samfile = function(multiqc_samfile){
    read_tsv(multiqc_samfile) %>%
        mutate(library = substr(Sample, 1, 6)) %>%
        mutate(bam_processing = gsub("_.*$", "", gsub("lib..._", "", Sample))) %>%
        select(!c(Sample)) %>%
        pivot_wider(
            names_from = c(bam_processing),
            values_from = !c(library, bam_processing))
}

samstats = process_multiqc_samfile(samstats_input)
flagstats = process_multiqc_samfile(flagstats_input)

deeptools_frag = read_tsv(deeptools_frag_input,
                          col_names = c("frag_len", "frag_count", "file"),
                          skip = 1) %>%
    filter(frag_len < 500) %>%
    mutate(library = substr(gsub("^.*lib", "lib", file), 1, 6)) %>%
    mutate(frag_len = sub("^", "frag_len", frag_len)) %>%
    select(library, frag_len, frag_count) %>%
    pivot_wider(names_from = frag_len, values_from = frag_count)

picard = as_tibble(read.table(picard_input, header = TRUE, sep = '\t', stringsAsFactors = FALSE)) %>%
    mutate(library = Sample)

deeptools_cov = read_tsv(deeptools_cov_input, skip = 1) %>%
    pivot_longer(!c(`#'chr'`, `'start'`, `'end'`), names_to = "file", values_to = "cnt") %>%
    rename(chr = `#'chr'`, start = `'start'`, end = `'end'`) %>%
    mutate(library = substr(file, 2, 7)) %>%
    group_by(library) %>%
    summarise(
        mean_cov = mean(cnt),
        median_cov = median(cnt),
    )

readqc = fastqc %>%
    left_join(samstats, by = "library") %>%
    left_join(flagstats, by = "library") %>%
    left_join(deeptools_frag, by = "library") %>%
    left_join(picard, by = "library") %>%
    left_join(deeptools_cov, by = "library")

write.table(readqc, file = readqc_out_tbl, row.names = F, sep = '\t', quote = F)

all_frag_len = data.frame(frag_len = 1:500)

frag_len = readqc %>%
    select(starts_with("frag_len") | matches("library")) %>%
    pivot_longer(!library, names_to = "frag_len", values_to = "count") %>%
    mutate(frag_len = as.numeric(gsub("frag_len", "", frag_len))) %>%
    mutate(count = as.numeric(count)) %>%
    pivot_wider(names_from = library, values_from = count) %>%
    right_join(all_frag_len) %>%
    arrange(frag_len) %>%
    replace(is.na(.), 0)

write_tsv(frag_len, file = frag_len_out_tbl)
- Snakemake
rule downsample_bams:
    container: cfdna_wgs_container,
    input: cfdna_wgs_bams + "/{library}_filt.bam",
    output: touch(logdir + "/{library}_{downsample}_downsample.done"),
    params:
        out_dir = cfdna_wgs_bams,
        script = cfdna_wgs_scriptdir + "/downsample_bams.sh",
        suffix = "_filt.bam",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {input} \
        {wildcards.downsample} \
        {params.out_dir} \
        {params.suffix} \
        {params.threads}
        """
- Shell script
#!/usr/bin/env bash
# For unit testing
# in_bam=test/analysis/cfdna_wgs/bams/lib001_filt.bam
# milreads=0.001
# checker=test/tmp/lib001_ds0.001.txt
# outdir=test/analysis/cfdna_wgs/bams
# suffix=_filt.bam
# threads=4

in_bam="${1}"
milreads="${2}"
outdir="${3}"
suffix="${4}"
threads="${5}"

downsample(){
    # Derived variables
    milreads_full=$(awk -v milreads="${milreads}" 'BEGIN{milreads_full=(1000000*milreads); print milreads_full}')
    factor=$(samtools idxstats $in_bam | cut -f3 | awk -v count=$milreads_full 'BEGIN {total=0} {total += $1} END {print count/total}')
    base=$(basename -s $suffix $in_bam)
    out_bam=${outdir}/${base}_ds${milreads}.bam
    #
    # Downsample only when the target is smaller than the bam
    # (numeric comparison via awk; [[ $factor < 1 ]] would compare strings)
    if awk -v f="$factor" 'BEGIN{exit !(f < 1)}'; then
        samtools view -s $factor -b -@ $threads $in_bam > $out_bam
    fi
}

downsample $in_bam $milreads $suffix
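The factor arithmetic in the script, restated as a small Python sketch with hypothetical read counts (milreads is millions of reads, so 0.001 means 1,000 reads):

milreads = 0.001
target_reads = int(1_000_000 * milreads)   # 1,000
total_reads = 250_000                      # hypothetical idxstats mapped-read total
factor = target_reads / total_reads        # 0.004

# Subsample only when the bam holds more reads than the target.
if factor < 1:
    print(f"samtools view -s {factor} -b in.bam > out.bam")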
# If a downsample occurred, write the filename into this per-library log; else leave the log file blank
rule log_downsample:
input: logdir + "/{library}_{downsample}_downsample.done",
output: logdir + "/{library}_{downsample}_made",
params:
bamdir = cfdna_wgs_bams,
shell:
"""
dspath={params.bamdir}/{wildcards.library}_ds{wildcards.downsample}.bam
if [ -f $dspath ]; then echo "$dspath" > {output}; else touch {output}; fi
"""
# Use the downsampled bam logs to make a single text file of conditionally executed final targets.
# Specifically in this example, log text lines take the form
# cfdna_wgs_bams + "/{library}_ds{downsample}_frag90_150.bam" to set up conditional execution of fragment filtering ONLY on downsampled bams.
# Note: the alternative sed delimiter "~" allows a path value such as cfdna_wgs_bams as the param.
checkpoint ds_cond_target_list:
input: expand(logdir + "/{library}_{downsample}_made", library = CFDNA_WGS_LIBRARIES, downsample = DOWNSAMPLE),
output: logdir + "/ds_final_targets",
params:
outdir = cfdna_wgs_bams,
frag_distro=config["frag_distro"]
shell:
"""
if [ -f {output} ]; then rm {output}; fi
cat {input} > {output}
sed -i 's~^.*lib~{params.outdir}/lib~g' {output}
sed -i 's/.bam$/_frag{params.frag_distro}.bam/g' {output}
"""
# Function just pulls the final target names out of ds_final_targets
def get_ds_targets(wildcards):
with open(checkpoints.ds_cond_target_list.get(**wildcards).output[0], "r") as f:
non_empty_files = [l.strip() for l in f.readlines()]
return non_empty_files
# This rule allows execution of rules which will generate the conditional targets in ds_cond_target_list
rule make_ds_targets:
input:
get_ds_targets
output: logdir + "/aggregate_output"
run:
with open(output[0], "w") as f:
f.write("\n".join(input))
rule frag_filt:
container: cfdna_wgs_container,
input:
main = cfdna_wgs_bams + "/{library}_ds{downsample}.bam",
check = logdir + "/{library}_{downsample}_made",
output:
nohead = temp(cfdna_wgs_bams + "/{library}_ds{downsample}_frag{frag_distro}.nohead"),
onlyhead = temp(cfdna_wgs_bams + "/{library}_ds{downsample}_frag{frag_distro}.only"),
final = cfdna_wgs_bams + "/{library}_ds{downsample}_frag{frag_distro}.bam",
params:
script = cfdna_wgs_scriptdir + "/frag_filt.sh",
threads = cfdna_wgs_threads,
shell:
"""
frag_min=$(echo {wildcards.frag_distro} | sed -e "s/_.*$//g")
frag_max=$(echo {wildcards.frag_distro} | sed -e "s/^.*_//g")
{params.script} \
{input.main} \
{output.nohead} \
$frag_min \
$frag_max \
{config[threads]} \
{output.onlyhead} \
{output.final}
"""
- Shell script
#!/usr/bin/env bash
# Positional arguments:
#  $1 = input bam, $2 = headerless filtered alignments, $3 = min fragment length,
#  $4 = max fragment length, $5 = threads, $6 = header file, $7 = final bam
#
# Steps
## Filter by absolute value of TLEN for each read
sambamba view -t $5 $1 |
    awk -F'\t' -v upper="$4" 'sqrt($9*$9) < upper {print $0}' |
    awk -F'\t' -v lower="$3" 'sqrt($9*$9) > lower {print $0}' > $2
## Restore header
sambamba view -H $1 > $6
cat $6 $2 |
    sambamba view -t 4 -S -f bam /dev/stdin |
    sambamba sort -t 4 -o $7 /dev/stdin
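The wildcard parsing and TLEN filter above, sketched in Python (the 90_150 value is one frag_distro from the config):

frag_distro = "90_150"
frag_min, frag_max = (int(x) for x in frag_distro.split("_"))

def keep(tlen):
    # awk's sqrt($9*$9) is simply abs(TLEN)
    return frag_min < abs(tlen) < frag_max

assert keep(-120) and not keep(90) and not keep(200)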
#########1#########2#########3#########4#########5#########6#########7#########8
# #
# Fragmentomic Analysis of Cell-free DNA Whole Genome Sequencing #
# #
#########1#########2#########3#########4#########5#########6#########7#########8
- Snakemake
rule make_gc_map_bind:
    container: cfdna_wgs_container,
    input:
        gc5mb = config["gc5mb"],
        blklist = config["blklist"],
    log: logdir + "/make_gc_map_bind.log",
    output: refdir + "/keep_5mb.bed",
    params:
        script = cfdna_wgs_scriptdir + "/make_gc_map_bind.sh",
    shell:
        """
        {params.script} \
        {input.gc5mb} \
        {input.blklist} \
        {output} &> {log}
        """
- Shell script
#!/usr/bin/env bash
gc5mb="${1}"
blklist="${2}"
keep="${3}"

# Keep 5 Mb windows that do not intersect the blacklist, sit on primary
# contigs (no "_" in the name), and have a column-4 (GC) value >= 0.3
bedtools intersect -a $gc5mb -b $blklist -v -wa |
    grep -v _ |
    awk '{ if ($4 >= 0.3) print $0 }' > $keep
- An error here may be due to multimappers; see https://www.biostars.org/p/55149/
- Snakemake
# Make a bed file from filtered bam

rule filt_bam_to_frag_bed:
    benchmark: benchdir + "/{library}_filt_bam_to_frag_bed.benchmark.txt",
    container: cfdna_wgs_container,
    input: cfdna_wgs_bams + "/{library}_filt.bam",
    log: logdir + "/{library}_filt_bam_to_frag_bed.log",
    output: cfdna_wgs_frag_beds + "/{library}_filt.bed",
    params:
        fasta = genome_fasta,
        script = cfdna_wgs_scriptdir + "/filt_bam_to_frag_bed.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {input} \
        {params.fasta} \
        {params.threads} \
        {output}
        """
- Shell script
#!/usr/bin/env bash
# Snakemake variables
input_bam="$1"
params_fasta="$2"
threads="${3}"
output_frag_bed="$4"

# Function
bam_to_frag(){
    # Ensure name-sorted bam file
    samtools sort -@ $threads -n -o - $1 |
        samtools fixmate -@ $threads -m -r - - |
        # Make bedpe
        bedtools bamtobed -bedpe -i - |
        # Filter any potential non-standard alignments
        awk '$1==$4 {print $0}' |
        awk '$2 < $6 {print $0}' |
        # Create full-fragment bed file
        awk -v OFS='\t' '{print $1,$2,$6}' |
        # Annotate with GC content and fragment length
        bedtools nuc -fi $2 -bed stdin |
        # Convert back to standard bed with additional columns
        awk -v OFS='\t' '{print $1,$2,$3,$5,$12}' |
        sed '1d' > $3
}

# Run command
bam_to_frag $input_bam \
            $params_fasta \
            $output_frag_bed
- Snakemake
# Make GC distributions

rule gc_distro:
    benchmark: benchdir + "/{library}_cfdna_wgs_gc_distro.benchmark.txt",
    container: cfdna_wgs_container,
    input: cfdna_wgs_frag_beds + "/{library}_filt.bed",
    log: logdir + "/{library}_cfdna_wgs_gc_distro.log",
    output: cfdna_wgs_frag_gc_distros + "/{library}_gc_distro.csv",
    params:
        script = cfdna_wgs_scriptdir + "/gc_distro.R",
    shell:
        """
        Rscript {params.script} \
        {input} \
        {output} \
        > {log} 2>&1
        """
- Rscript
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly = TRUE)
bed_file = args[1]
distro_file = args[2]

library(tidyverse)

# Read in modified bed
bed = read.table(bed_file, sep = '\t')
names(bed) = c("chr", "start", "end", "gc_raw", "len")

# Generate distribution csv
distro = bed %>%
    # Round GC
    mutate(gc_strata = round(gc_raw, 2)) %>%
    # Count frags per strata
    count(gc_strata) %>%
    # Get fraction frags
    mutate(fract_frags = n/sum(n)) %>%
    mutate(library_id = gsub("_frag.bed", "", gsub("^.*lib", "lib", bed_file))) %>%
    select(library_id, gc_strata, fract_frags) %>%
    write.csv(file = distro_file, row.names = F)
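The strata computation in the Rscript above, restated with pandas on hypothetical GC fractions:

import pandas as pd

bed = pd.DataFrame({"gc_raw": [0.412, 0.408, 0.553, 0.548, 0.410]})

distro = (
    bed.assign(gc_strata=bed.gc_raw.round(2))  # round GC to 0.01 strata
       .groupby("gc_strata").size()            # count frags per stratum
       .pipe(lambda s: s / s.sum())            # fraction of fragments
)
# gc_strata 0.41 -> 0.6, 0.55 -> 0.4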
- Snakemake
# Make healthy GC distributions summary file

rule healthy_gc:
    benchmark: benchdir + "/cfdna_wgs_healthy_gc.benchmark.txt",
    container: cfdna_wgs_container,
    input: expand(cfdna_wgs_frag_gc_distros + "/{library}_gc_distro.csv", library = CFDNA_WGS_HEALTHY_LIBRARIES),
    log: logdir + "/cfdna_wgs_healthy_gc.log",
    output: cfdna_wgs_frag_gc_distros + "/healthy_med.rds",
    params:
        distro_dir = cfdna_wgs_frag_gc_distros,
        script = cfdna_wgs_scriptdir + "/healthy_gc.R",
    shell:
        """
        Rscript {params.script} \
        {params.distro_dir} \
        "{input}" \
        {output} > {log} 2>&1
        """
- Rscript
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly = TRUE)
distro_dir = args[1]
healthy_libs_str = args[2]
healthy_med_file = args[3]

library(tidyverse)

healthy_libs_distros = unlist(strsplit(healthy_libs_str, " "))

read_in_gc = function(gc_csv){
    read.csv(gc_csv, header = T)
}

healthy_list = lapply(healthy_libs_distros, read_in_gc)

# Bind
healthy_all = do.call(rbind, healthy_list)

# Summarize
healthy_med = healthy_all %>%
    group_by(gc_strata) %>%
    summarise(med_frag_fract = median(fract_frags))

# Save
saveRDS(healthy_med, file = healthy_med_file)
- Snakemake
# Sample fragments by healthy GC proportions

rule cfdna_wgs_gc_sample:
    benchmark: benchdir + "/{library}_cfdna_wgs_gc_sample.benchmark.txt",
    container: cfdna_wgs_container,
    input:
        frag_bed = cfdna_wgs_frag_beds + "/{library}_filt.bed",
        healthy_med = cfdna_wgs_frag_gc_distros + "/healthy_med.rds",
    log: logdir + "/{library}_cfdna_wgs_gc_sample.log",
    output: cfdna_wgs_frag_beds + "/{library}_sampled_frag.bed",
    params:
        script = cfdna_wgs_scriptdir + "/gc_sample.R",
    shell:
        """
        Rscript {params.script} \
        {input.healthy_med} \
        {input.frag_bed} \
        {output} > {log} 2>&1
        """
- Rscript
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly = TRUE)
healthy_med = args[1]
frag_file = args[2]
sampled_file = args[3]

library(tidyverse)

healthy_fract = readRDS(healthy_med)
frag_file = read.table(frag_file, sep = '\t', header = F)
frag_bed = frag_file
names(frag_bed) = c("chr", "start", "end", "gc_raw", "len")

frag = frag_bed %>%
    # Round off the GC strata
    mutate(gc_strata = round(gc_raw, 2)) %>%
    # Join the median count of fragments per strata in healthies
    # Use this later as sampling weight
    left_join(healthy_fract, by = "gc_strata")

# Determine frags to sample by counts in strata for which healthies had the
# highest count (currently unused below)
stratatotake = frag$gc_strata[which.max(frag$med_frag_fract)]
fragsinmaxstrata = length(which(frag$gc_strata == stratatotake))
fragstotake = round(fragsinmaxstrata/stratatotake)

sampled = frag %>%
    filter(!is.na(med_frag_fract)) %>%
    slice_sample(., n = nrow(.), weight_by = med_frag_fract, replace = T) %>%
    select(chr, start, end, len, gc_strata)

write.table(sampled, sep = "\t", col.names = F, row.names = F, quote = F, file = sampled_file)
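The core of the sampling step is a weighted resample in which each fragment's weight is the healthy median fraction for its GC stratum; a pandas sketch with made-up values:

import pandas as pd

frag = pd.DataFrame({
    "len":            [110, 160, 210, 95],
    "gc_strata":      [0.41, 0.55, 0.41, 0.60],
    "med_frag_fract": [0.6, 0.4, 0.6, None],  # None: stratum absent in healthies
})

sampled = (
    frag.dropna(subset=["med_frag_fract"])    # like filter(!is.na(med_frag_fract))
        .sample(n=3, weights="med_frag_fract", replace=True, random_state=0)
)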
- Snakemake
# Sum fragments in short and long length groups

rule frag_sum:
    benchmark: benchdir + "/{library}_frag_sum.benchmark.txt",
    container: cfdna_wgs_container,
    input: cfdna_wgs_frag_beds + "/{library}_sampled_frag.bed",
    log: logdir + "/{library}_cfdna_wgs_frag_window_sum.log",
    output:
        short = cfdna_wgs_frag_beds + "/{library}_norm_short.bed",
        long = cfdna_wgs_frag_beds + "/{library}_norm_long.bed",
    params:
        script = cfdna_wgs_scriptdir + "/frag_window_sum.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {input} \
        {output.short} {output.long} &> {log}
        """
- Shell script
#!/usr/bin/env bash
input_frag="$1"
output_short="$2"
output_long="$3"

# Functions
# Column 4 of the sampled fragment bed is fragment length
make_short(){
    cat $1 | awk '{if ($4 >= 100 && $4 <= 150) print $0}' > $2
}
make_long(){
    cat $1 | awk '{if ($4 >= 151 && $4 <= 220) print $0}' > $2
}

# Run command
make_short $input_frag $output_short
make_long $input_frag $output_long
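The two awk filters define DELFI-style length classes; the same classification in Python:

def len_class(frag_len):
    if 100 <= frag_len <= 150:
        return "short"
    if 151 <= frag_len <= 220:
        return "long"
    return None  # fragment not counted in either class

assert len_class(120) == "short"
assert len_class(180) == "long"
assert len_class(60) is None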
- Snakemake
# Count short and long fragments intersecting kept genomic windows

rule frag_window_count:
    benchmark: benchdir + "/{library}_cfdna_wgs_frag_window_int.benchmark.txt",
    container: cfdna_wgs_container,
    input:
        short = cfdna_wgs_frag_beds + "/{library}_norm_short.bed",
        long = cfdna_wgs_frag_beds + "/{library}_norm_long.bed",
        matbed = refdir + "/keep_5mb.bed",
    log: logdir + "/{library}_cfdna_wgs_frag_window_int.log",
    output:
        short = cfdna_wgs_frag_counts + "/{library}_cnt_short.tmp",
        long = cfdna_wgs_frag_counts + "/{library}_cnt_long.tmp",
    params:
        script = cfdna_wgs_scriptdir + "/frag_window_int.sh",
        threads = threads,
    shell:
        """
        {params.script} \
        {input.short} \
        {input.matbed} \
        {output.short}
        {params.script} \
        {input.long} \
        {input.matbed} \
        {output.long}
        """
- Shell script
#!/usr/bin/env bash
input=$1
keep_bed=$2
output=$3

bedtools intersect -c \
    -a $keep_bed \
    -b $input > $output
- Snakemake
# Merge short and long fragment counts by genomic position for all libraries

rule cfdna_wgs_count_merge:
    benchmark: benchdir + "/cfdna_wgs_count_merge.benchmark.txt",
    container: cfdna_wgs_container,
    input: expand(cfdna_wgs_frag_counts + "/{library}_cnt_{length}.tmp", library = CFDNA_WGS_LIBRARIES, length = ["short", "long"]),
    log: logdir + "/cfdna_wgs_count_merge.log",
    output: cfdna_wgs_frag + "/frag_counts.tsv",
    params:
        counts_dir = cfdna_wgs_frag + "/counts",
        script = cfdna_wgs_scriptdir + "/count_merge.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {params.counts_dir} \
        {output} &> {log}
        """
- Shell script
#!/usr/bin/env bash
# For unit testing
#counts_dir="/home/jeszyman/mpnst/analysis/cfdna-wgs/frag/counts"
#out_tsv="/home/jeszyman/mpnst/analysis/cfdna-wgs/frag/frag_counts.tsv"

# Define variables
counts_dir="${1}"
out_tsv="${2}"

# Remove the existing aggregate file if present
if [ -f $out_tsv ]; then rm $out_tsv; fi
#touch $out_tsv

# Make aggregate file
for file in ${counts_dir}/*; do
    # Add file name to each line
    awk '{print FILENAME (NF?"\t":"") $0}' $file |
        # Modify file name to library id
        sed 's/^.*lib/lib/g' |
        sed 's/_.*_/\t/g' |
        # Cleanup "tmp"
        sed 's/.tmp//g' |
        # Send to output
        sed 's/\.bed//g' >> $out_tsv
done

# Add a tab-separated header
sed -i '1 i\library\tlen_class\tchr\tstart\tend\tgc\tcount' $out_tsv
#!/usr/bin/env bash
output=$1
declare -a array2=$2

if [ -f $output ]; then \rm $output; fi

for file in ${array2[@]}; do
    awk '{print FILENAME (NF?"\t":"") $0}' $file |
        sed 's/^.*lib/lib/g' |
        sed 's/_.*_/\t/g' |
        # Cleanup "tmp"
        sed 's/.tmp//g' |
        sed 's/\.bed//g' >> $output
done

# Add a tab-separated header
sed -i '1 i\library\tlen_class\tchr\tstart\tend\tcount' $output
- Snakemake
rule unit_cent_sd:
    benchmark: benchdir + "/unit_cent_sd.benchmark.txt",
    container: cfdna_wgs_container,
    input: cfdna_wgs_frag + "/frag_counts.tsv",
    log: logdir + "/unit_cent_sd.log",
    output: cfdna_wgs_frag + "/ratios.tsv",
    params:
        script = cfdna_wgs_scriptdir + "/make_ratios.R",
    shell:
        """
        Rscript {params.script} \
        {input} {output} > {log} 2>&1
        """
- Rscript
#!/usr/bin/env Rscript
# For unit testing
## frags_tsv = "test/analysis/cfdna_wgs/frag/frag_counts.tsv"
## ratios_tsv = "/home/jeszyman/mpnst/analysis/cfdna-wgs/frag/ratios.tsv"

args = commandArgs(trailingOnly = TRUE)
frags_tsv = args[1]
ratios_tsv = args[2]

# Load necessary packages
library(tidyverse)

# Load aggregate frag tsv
frags = read_tsv(frags_tsv)

# From per-position, per-library short and long fragment counts, make a zero-centered fragment ratio
# See https://github.com/cancer-genomics/reproduce_lucas_wflow/blob/master/analysis/fig2a.Rmd
ratios = frags %>%
    mutate_at(vars(start, end, count), as.numeric) %>%
    # Put lib-bin short and long values on the same row in order to make per-row ratios
    pivot_wider(names_from = len_class, values_from = count,
                values_fn = function(x) mean(x)) %>%
    mutate(fract = short/long) %>%
    select(library, chr, start, end, fract) %>%
    # Zero-center by library
    group_by(library) %>%
    mutate(ratio.centered = scale(fract, scale = F)[,1])

write_tsv(ratios, file = ratios_tsv)
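The short/long ratio and zero-centering, restated with pandas on hypothetical per-window counts for one library:

import pandas as pd

counts = pd.DataFrame({
    "window": ["w1", "w2", "w3"],
    "short":  [120, 80, 100],
    "long":   [100, 100, 100],
})

counts["fract"] = counts["short"] / counts["long"]
# Equivalent of R's scale(fract, scale = FALSE): subtract the library mean.
counts["ratio.centered"] = counts["fract"] - counts["fract"].mean()
# [0.2, -0.2, 0.0]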
- Based on cfDNA fragmentomics cite:mathios2021
#########1#########2#########3#########4#########5#########6#########7#########8
# #
# Integration Testing Snakefile for Analysis of Cell-free DNA #
# Whole Genome Sequencing Copy Number Alteration and Fragmentomics #
# #
#########1#########2#########3#########4#########5#########6#########7#########8
# Load necessary packages for snakemake run
import os
import pandas as pd
import re
import numpy as np
# Variable naming
benchdir = config["benchdir"]
cfdna_wgs_repo = config["cfdna_wgs_repo"]
cfdna_wgs_scriptdir = config["cfdna_wgs_scriptdir"]
logdir = config["logdir"]
threads = config["threads"]
# Suggested directory structure:
analysis = config["datadir"] + "/analysis"
cfdna_wgs = config["datadir"] + "/analysis/cfdna_wgs"
cfdna_wgs_cna = config["datadir"] + "/analysis/cfdna_wgs/cna"
cfdna_wgs_frag = config["datadir"] + "/analysis/cfdna_wgs/frag"
# Terminal variable paths:
# (These variables are used directly in the cna snakefile)
cfdna_wgs_cna_in_bams = cfdna_wgs_cna + "/input_bams"
cfdna_wgs_cna_frag_bams = cfdna_wgs_cna + "/frag_bams"
cfdna_wgs_cna_wigs = cfdna_wgs_cna + "/wigs"
cfdna_wgs_cna_ichor_nopon = cfdna_wgs_cna + "/ichor_nopon"
cfdna_wgs_frag_input_bams = cfdna_wgs_cna + "/input_bams"
cfdna_wgs_frag_beds = cfdna_wgs_frag + "/beds"
cfdna_wgs_frag_counts = cfdna_wgs_frag + "/counts"
refdir = config["datadir"] + "/ref"
# Additional variable names used directly in the cna snakefile:
chrom_sizes = config["chrom_sizes"]
genome_fasta = "/mnt/ris/aadel/Active/mpnst/inputs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna"
#TMP_FRAG_LIBS = ["lib001_filt","lib002_filt"]
#chrs = "chr8"
chrs = "chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chrX,chrY,chrM"
keep_bed = refdir + "/hg38_keep.bed"
blklist = config["blklist"]
genome_ref = config["genome_ref"]
FRAG_DISTROS = config["frag_distro"]
cfdna_wgs_threads = config["threads"]
cfdna_wgs_scriptdir = config["cfdna_wgs_scriptdir"]
cfdna_wgs_container = config["cfdna_wgs_container"]
default_container = config["default_container"]
autosome_bed = refdir + "/hg38_autosomes.bed"
cfdna_wgs_fastqs = cfdna_wgs + "/fastqs"
cfdna_wgs_bams = cfdna_wgs + "/bams"
qc = config["datadir"] + "/qc"
# cfdna_wgs_container = config["cfdna_wgs_container"]
# cfdna_wgs_cna_bam_inputs = config["dir"]["data"] + "/bam/filt"
# cfdna_wgs_cna_bam_fragfilt = config["dir"]["data"] + "/bam/frag"
# wig = config["dir"]["data"] + "/wig"
# ichor = config["dir"]["data"] + "/ichor"
# cfdna_wgs_logs = config["dir"]["data"] + "logs/cfdna_wgs"
# ichor_nopon = config["dir"]["data"] + "/ichor_nopon"
libraries = pd.read_table(config["datadir"] + "/inputs/libraries.tsv")
readable = []
for x in libraries.file:
readable.append(os.access(x, os.R_OK))
libraries['readable']=readable
cfdna_libraries = libraries
cfdna_libraries = cfdna_libraries[cfdna_libraries.library_type == "wgs"]
cfdna_libraries = cfdna_libraries[cfdna_libraries.isolation_type == "cfdna"]
cfdna_libraries = cfdna_libraries[cfdna_libraries.readable == True]
library_indict = cfdna_libraries["library"].tolist()
file_indict = cfdna_libraries["file"].tolist()
lib_dict = dict(zip(library_indict, file_indict))
CFDNA_WGS_LIBRARIES = list(lib_dict.keys())
cna_libraries = pd.read_table(config["datadir"] + "/inputs/cna_libraries.tsv")
readable = []
for x in cna_libraries.bam_file:
readable.append(os.access(x, os.R_OK))
cna_libraries['readable']=readable
cna_libraries = cna_libraries[cna_libraries.readable == True]
library_indict = cna_libraries["library"].tolist()
file_indict = cna_libraries["bam_file"].tolist()
lib_dict = dict(zip(library_indict, file_indict))
CNA_WGS_LIBRARIES = list(lib_dict.keys())
rule all:
input:
# # From this snakefile:
# # cfdna_wgs_symlink:
# expand(cfdna_wgs_cna_in_bams +
# "/{library}.bam",
# library = lib_dict.keys()),
# # From cna.smk
# # cna_frag_filt:
# expand(cfdna_wgs_cna_frag_bams +
# "/{library}_frag{frag_distro}.bam",
# library = CNA_WGS_LIBRARIES,
# frag_distro = FRAG_DISTROS),
# # bam_to_wig:
# expand(cfdna_wgs_cna_wigs +
# "/{library}_frag{frag_distro}.wig",
# library = CNA_WGS_LIBRARIES,
# frag_distro = FRAG_DISTROS),
# # ichor_nopon:
# expand(cfdna_wgs_cna_ichor_nopon +
# "/{library}_frag{frag_distro}.cna.seg",
# library = CNA_WGS_LIBRARIES,
# frag_distro = FRAG_DISTROS),
# From frag.smk
# make_gc_map_bind:
refdir + "/keep_5mb.bed",
# filt_bam_to_frag_bed:
expand(cfdna_wgs_frag_beds +
"/{library}_filt.bed",
library = CNA_WGS_LIBRARIES),
# # gc_distro:
# expand(cfdna_wgs_frag_gc_distros +
# "/{library}_gc_distro.csv",
# library = CNA_WGS_LIBRARIES),
# # healthy_gc:
# cfdna_wgs_frag_gc_distros + "/healthy_med.rds",
# #
# expand(cfdna_wgs_frag_beds +
# "/{library}_sampled_frag.bed",
# library = CNA_WGS_LIBRARIES),
# expand(cfdna_wgs_frag_beds) /
# "{library}_norm_{length}.bed",
# library = CNA_WGS_LIBRARIES,
# length = ["short", "long"]),
expand(cfdna_wgs_frag_counts +
"/{library}_cnt_{length}.tmp",
library = CNA_WGS_LIBRARIES,
length = ["short", "long"]),
cfdna_wgs_frag + "/frag_counts.tsv",
#
# unit_cent_sd:
cfdna_wgs_frag + "/ratios.tsv",
- Snakemake
# Symlink input bams

rule cfdna_wgs_symlink:
    container: cfdna_wgs_container,
    input: lambda wildcards: lib_dict[wildcards.library],
    output: cfdna_wgs_cna_in_bams + "/{library}.bam",
    shell:
        """
        ln --force --relative --symbolic {input} {output}
        """
include: cfdna_wgs_repo + "/workflow/reads.smk"
include: cfdna_wgs_repo + "/workflow/cna.smk"
include: cfdna_wgs_repo + "/workflow/frag.smk"
This repository hosts a snakemake workflow for basic processing of whole-genome sequencing reads from cell-free DNA.
The master branch of the repository contains the most recent developments, while stable versions are saved as terminal branches (e.g. stable.1.0.0).
The workflow directory contains two types of workflows: process-focused snakefiles (reads.smk, cna.smk, frag.smk) suitable for integration into another snakemake pipeline using the include: directive, and the _int_test snakefile with examples of such integration using the repository test data.
- All software needed for the pipeline is present within the associated docker container (see docker and https://hub.docker.com/repository/docker/jeszyman/cfdna_wgs/general).
- See the example configuration yaml config/int_test.yaml and the wrapper workflow workflow/int_test for necessary run conditions.
- [2023-01-26 Thu] - Version 9.1.0: Repo cleanup
- [2023-01-26 Thu] - Version 9.0.0: Removed the -f 3 (proper-pair) flag from samtools filtering, as BWA's proper-pair flagging removes some fragments above a set maximum length. Added framework for benchmark analysis. Added conditional execution of downsampling. Removed (temporarily) the final wig and ichor commands of CNA, as these don't currently run correctly without full-genome alignment and so can't be validated on test data. Added local documentation of the cfdna-wgs dockerfile.
- [2023-01-21 Sat] - Version 8.0.0: Corrected rule filt_bam_to_frag_bed to fix mates of inputs, which seems to prevent errors in the bamtobed call. Frag_window_count now uses windows of consistent 5 Mb size, which are generated from rule make_gc_map_bind. Added a merged fragment counts file and zero-centered unit SD counts.
- [2022-12-07 Wed] - Version 7.0.0: Added copy number alteration and DELFI fragmentomics.
- [2022-10-17 Mon] - Version 6.0.0: Using fastp for read trimming (replaces trimmomatic). Simplified naming schema. Removed downsampling (will reinstate in later version).
- [2022-09-08 Thu] - Version 5.3.0: Some minor name changes.
- [2022-08-19 Fri] - Version 5.2.0 validated: Adds bamCoverage and plotCoverage from deeptools. Benchmarks BWA.
- [2022-08-09 Tue] - Version 5.1.0 validated: Added cfdna wgs-specific container for each rule, referenced to config
- [2022-08-05 Fri] - Version 5.0.0 validated: Added a symlink rule based on python dictionary. Added repo-specific output naming, added checks for sequence type and file readability to input tsv.
- [2022-06-27 Mon] - Version 4 validated. Further expanded read_qc.tsv table. Removed bam post-processing step and added a more expansive bam filtering step. Updated downsampling to work off filtered alignments.
- [2022-06-26 Sun] - Version 3.2 validated. Expanded the qc aggregate table and added some comments.
- [2022-06-24 Fri] - Validate version 3.1 which includes genome index build as a snakefile rule.
- [2022-06-24 Fri] - Validated version 3 with read number checkpoint for down-sampling.
- [2022-05-31 Tue] - Conforms to current biotools best practices.
- [2022-04-29 Fri] - Moved multiqc to integration testing as inputs are dependent on final sample labels. Integration testing works per this commit.
- kill v7 - not working for CNA
- for file in ./*; do base=$(basename $file); str=$(tail -n1 $file); echo $base $str; done
#########1#########2#########3#########4#########5#########6#########7#########8
# #
# Copy-number Alteration Analysis of #
# Cell-free DNA Whole Genome Sequencing #
# #
#########1#########2#########3#########4#########5#########6#########7#########8
- Snakemake
# Use readCounter to create windowed wig from bam file

rule bam_to_wig:
    benchmark: benchdir + "/{library}_ds{downsample}_{frag_distro}_cfdna_wgs_bam_to_wig.benchmark.txt",
    container: cfdna_wgs_container,
    input: cfdna_wgs_bams + "/{library}_ds{downsample}_frag{frag_distro}.bam",
    log: logdir + "/{library}_ds{downsample}_{frag_distro}_cfdna_wgs_bam_to_wig.log",
    output: cfdna_wgs_wigs + "/{library}_ds{downsample}_frag{frag_distro}.wig",
    params:
        chrs = chrs,
        outdir = cfdna_wgs_wigs,
        script = cfdna_wgs_scriptdir + "/bam_to_wig.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        mkdir -p {params.outdir}
        /opt/hmmcopy_utils/bin/readCounter \
        --chromosome "{params.chrs}" \
        --quality 20 \
        --window 1000000 \
        {input} > {output}
        """
- Shell script
#!/usr/bin/env bash
input=$1
output=$2
chrs=$3

/opt/hmmcopy_utils/bin/readCounter --window 1000000 --quality 20 \
    --chromosome "$chrs" \
    $input > $output
- Snakemake
# Run ichorCNA without a panel of normals

rule ichor_nopon:
    input: cfdna_wgs_wigs + "/{library}_ds{downsample}_frag{frag_distro}.wig",
    output: cfdna_wgs_ichor_nopon + "/{library}_ds{downsample}_frag{frag_distro}.cna.seg",
    params:
        script = cfdna_wgs_scriptdir + "/MOD_runIchorCNA.R",
        out_dir = cfdna_wgs_ichor_nopon,
    container: cfdna_wgs_container,
    shell:
        """
        Rscript {params.script} \
        --id {wildcards.library}_frag{wildcards.frag_distro} \
        --WIG {input} \
        --gcWig /opt/ichorCNA/inst/extdata/gc_hg38_1000kb.wig \
        --mapWig /opt/ichorCNA/inst/extdata/map_hg38_1000kb.wig \
        --centromere /opt/ichorCNA/inst/extdata/GRCh38.GCA_000001405.2_centromere_acen.txt \
        --normal "c(0.95, 0.99, 0.995, 0.999)" \
        --ploidy "c(2)" \
        --maxCN 3 \
        --estimateScPrevalence FALSE \
        --scStates "c()" \
        --outDir {params.out_dir}
        """
- Rscript
# file: ichorCNA.R
# authors: Gavin Ha, Ph.D.
#          Fred Hutch
# contact: <gha@fredhutch.org>
#
#          Justin Rhoades
#          Broad Institute
# contact: <rhoades@broadinstitute.org>
#
# ichorCNA: https://github.com/broadinstitute/ichorCNA
# date: July 24, 2019
# description: Hidden Markov model (HMM) to analyze Ultra-low pass whole genome sequencing (ULP-WGS) data.
# This script is the main script to run the HMM.

library(optparse)

option_list <- list(
    make_option(c("--WIG"), type = "character", help = "Path to tumor WIG file. Required."),
    make_option(c("--NORMWIG"), type = "character", default=NULL, help = "Path to normal WIG file. Default: [%default]"),
    make_option(c("--gcWig"), type = "character", help = "Path to GC-content WIG file; Required"),
    make_option(c("--mapWig"), type = "character", default=NULL, help = "Path to mappability score WIG file. Default: [%default]"),
    make_option(c("--normalPanel"), type="character", default=NULL, help="Median corrected depth from panel of normals. Default: [%default]"),
    make_option(c("--exons.bed"), type = "character", default=NULL, help = "Path to bed file containing exon regions. Default: [%default]"),
    make_option(c("--id"), type = "character", default="test", help = "Patient ID. Default: [%default]"),
    make_option(c("--centromere"), type="character", default=NULL, help = "File containing Centromere locations; if not provided then will use hg19 version from ichorCNA package. Default: [%default]"),
    make_option(c("--minMapScore"), type = "numeric", default=0.9, help="Include bins with a minimum mappability score of this value. Default: [%default]."),
    make_option(c("--rmCentromereFlankLength"), type="numeric", default=1e5, help="Length of region flanking centromere to remove. Default: [%default]"),
    make_option(c("--normal"), type="character", default="0.5", help = "Initial normal contamination; can be more than one value if additional normal initializations are desired. Default: [%default]"),
    make_option(c("--scStates"), type="character", default="NULL", help = "Subclonal states to consider. Default: [%default]"),
    make_option(c("--coverage"), type="numeric", default=NULL, help = "PICARD sequencing coverage. Default: [%default]"),
    make_option(c("--lambda"), type="character", default="NULL", help="Initial Student's t precision; must contain 4 values (e.g. c(1500,1500,1500,1500)); if not provided then will automatically use based on variance of data. Default: [%default]"),
    make_option(c("--lambdaScaleHyperParam"), type="numeric", default=3, help="Hyperparameter (scale) for Gamma prior on Student's-t precision. Default: [%default]"),
    # make_option(c("--kappa"), type="character", default=50, help="Initial state distribution"),
    make_option(c("--ploidy"), type="character", default="2", help = "Initial tumour ploidy; can be more than one value if additional ploidy initializations are desired. Default: [%default]"),
    make_option(c("--maxCN"), type="numeric", default=7, help = "Total clonal CN states. Default: [%default]"),
    make_option(c("--estimateNormal"), type="logical", default=TRUE, help = "Estimate normal. Default: [%default]"),
    make_option(c("--estimateScPrevalence"), type="logical", default=TRUE, help = "Estimate subclonal prevalence. Default: [%default]"),
    make_option(c("--estimatePloidy"), type="logical", default=TRUE, help = "Estimate tumour ploidy. Default: [%default]"),
    make_option(c("--maxFracCNASubclone"), type="numeric", default=0.7, help="Exclude solutions with fraction of subclonal events greater than this value. Default: [%default]"),
    make_option(c("--maxFracGenomeSubclone"), type="numeric", default=0.5, help="Exclude solutions with subclonal genome fraction greater than this value. Default: [%default]"),
    make_option(c("--minSegmentBins"), type="numeric", default=50, help="Minimum number of bins for largest segment threshold required to estimate tumor fraction; if below this threshold, then will be assigned zero tumor fraction."),
    make_option(c("--altFracThreshold"), type="numeric", default=0.05, help="Minimum proportion of bins altered required to estimate tumor fraction; if below this threshold, then will be assigned zero tumor fraction. Default: [%default]"),
    make_option(c("--chrNormalize"), type="character", default="c(1:22)", help = "Specify chromosomes to normalize GC/mappability biases. Default: [%default]"),
    make_option(c("--chrTrain"), type="character", default="c(1:22)", help = "Specify chromosomes to estimate params. Default: [%default]"),
    make_option(c("--chrs"), type="character", default="c(1:22,\"X\")", help = "Specify chromosomes to analyze. Default: [%default]"),
    make_option(c("--genomeBuild"), type="character", default="hg19", help="Genome build. Default: [%default]"),
    make_option(c("--genomeStyle"), type = "character", default = "NCBI", help = "NCBI or UCSC chromosome naming convention; use UCSC if desired output is to have \"chr\" string. [Default: %default]"),
    make_option(c("--normalizeMaleX"), type="logical", default=TRUE, help = "If male, then normalize chrX by median. Default: [%default]"),
    make_option(c("--minTumFracToCorrect"), type="numeric", default=0.1, help = "Tumor-fraction correction of bin and segment-level CNA if sample has minimum estimated tumor fraction. [Default: %default]"),
    make_option(c("--fracReadsInChrYForMale"), type="numeric", default=0.001, help = "Threshold for fraction of reads in chrY to assign as male. Default: [%default]"),
    make_option(c("--includeHOMD"), type="logical", default=FALSE, help="If FALSE, then exclude HOMD state. Useful when using large bins (e.g. 1Mb). Default: [%default]"),
    make_option(c("--txnE"), type="numeric", default=0.9999999, help = "Self-transition probability. Increase to decrease number of segments. Default: [%default]"),
    make_option(c("--txnStrength"), type="numeric", default=1e7, help = "Transition pseudo-counts. Exponent should be the same as the number of decimal places of --txnE. Default: [%default]"),
    make_option(c("--plotFileType"), type="character", default="pdf", help = "File format for output plots. Default: [%default]"),
    make_option(c("--plotYLim"), type="character", default="c(-2,2)", help = "ylim to use for chromosome plots. Default: [%default]"),
    make_option(c("--outDir"), type="character", default="./", help = "Output Directory. Default: [%default]"),
    make_option(c("--libdir"), type = "character", default=NULL, help = "Script library path. Usually exclude this argument unless custom modifications have been made to the ichorCNA R package code and the user would like to source those R files. Default: [%default]")
)

parseobj <- OptionParser(option_list=option_list)
opt <- parse_args(parseobj)
print(opt)
options(scipen=0, stringsAsFactors=F)

library(HMMcopy)
library(GenomicRanges)
library(GenomeInfoDb)
options(stringsAsFactors=FALSE)
options(bitmapType='cairo')

patientID <- opt$id
tumour_file <- opt$WIG
normal_file <- opt$NORMWIG
gcWig <- opt$gcWig
mapWig <- opt$mapWig
normal_panel <- opt$normalPanel
exons.bed <- opt$exons.bed # "0" if none specified
centromere <- opt$centromere
minMapScore <- opt$minMapScore
flankLength <- opt$rmCentromereFlankLength
normal <- eval(parse(text = opt$normal))
scStates <- eval(parse(text = opt$scStates))
lambda <- eval(parse(text = opt$lambda))
lambdaScaleHyperParam <- opt$lambdaScaleHyperParam
estimateNormal <- opt$estimateNormal
estimatePloidy <- opt$estimatePloidy
estimateScPrevalence <- opt$estimateScPrevalence
maxFracCNASubclone <- opt$maxFracCNASubclone
maxFracGenomeSubclone <- opt$maxFracGenomeSubclone
minSegmentBins <- opt$minSegmentBins
altFracThreshold <- opt$altFracThreshold
ploidy <- eval(parse(text = opt$ploidy))
coverage <- opt$coverage
maxCN <- opt$maxCN
txnE <- opt$txnE
txnStrength <- opt$txnStrength
normalizeMaleX <- as.logical(opt$normalizeMaleX)
includeHOMD <- as.logical(opt$includeHOMD)
minTumFracToCorrect <- opt$minTumFracToCorrect
fracReadsInChrYForMale <- opt$fracReadsInChrYForMale
chrXMedianForMale <- -0.1
outDir <- opt$outDir
libdir <- opt$libdir
plotFileType <- opt$plotFileType
plotYLim <- eval(parse(text=opt$plotYLim))
gender <- NULL
outImage <- paste0(outDir,"/", patientID,".RData")
genomeBuild <- opt$genomeBuild
genomeStyle <- opt$genomeStyle
chrs <- as.character(eval(parse(text = opt$chrs)))
chrTrain <- as.character(eval(parse(text=opt$chrTrain)));
chrNormalize <- as.character(eval(parse(text=opt$chrNormalize)));
seqlevelsStyle(chrs) <- genomeStyle
seqlevelsStyle(chrNormalize) <- genomeStyle
seqlevelsStyle(chrTrain) <- genomeStyle

## load ichorCNA library or source R scripts
if (!is.null(libdir) && libdir != "None"){
    source(paste0(libdir,"/R/utils.R"))
    source(paste0(libdir,"/R/segmentation.R"))
    source(paste0(libdir,"/R/EM.R"))
    source(paste0(libdir,"/R/output.R"))
    source(paste0(libdir,"/R/plotting.R"))
} else {
    library(ichorCNA)
}

## load seqinfo
seqinfo <- getSeqInfo(genomeBuild, genomeStyle)

if (substr(tumour_file,nchar(tumour_file)-2,nchar(tumour_file)) == "wig") {
    wigFiles <- data.frame(cbind(patientID, tumour_file))
} else {
    wigFiles <- read.delim(tumour_file, header=F, as.is=T)
}

## FILTER BY EXONS IF PROVIDED ##
## add gc and map to GRanges object ##
if (is.null(exons.bed) || exons.bed == "None" || exons.bed == "NULL"){
    targetedSequences <- NULL
}else{
    targetedSequences <- read.delim(exons.bed, header=T, sep="\t")
}

## load PoN
if (is.null(normal_panel) || normal_panel == "None" || normal_panel == "NULL"){
    normal_panel <- NULL
}

if (is.null(centromere) || centromere == "None" || centromere == "NULL"){
    # no centromere file provided
    centromere <- system.file("extdata", "GRCh37.p13_centromere_UCSC-gapTable.txt", package = "ichorCNA")
}
centromere <- read.delim(centromere,header=T,stringsAsFactors=F,sep="\t")
save.image(outImage)

## LOAD IN WIG FILES ##
numSamples <- nrow(wigFiles)
tumour_copy <- list()
for (i in 1:numSamples) {
    id <- wigFiles[i,1]
    ## create output directories for each sample ##
    dir.create(paste0(outDir, "/", id, "/"), recursive = TRUE)
    ### LOAD TUMOUR AND NORMAL FILES ###
    message("Loading tumour file:", wigFiles[i,1])
    tumour_reads <- wigToGRanges(wigFiles[i,2])
    ## LOAD GC/MAP WIG FILES ###
    # find the bin size and load corresponding wig files
    # binSize <- as.data.frame(tumour_reads[1,])$width
    message("Reading GC and mappability files")
    if (is.null(gcWig) || gcWig == "None" || gcWig == "NULL"){
        stop("GC wig file is required")
    }
    gc <- wigToGRanges(gcWig)
    if (is.null(mapWig) || mapWig == "None" || mapWig == "NULL"){
        message("No mappability wig file input, excluding from correction")
        map <- NULL
    } else {
        map <- wigToGRanges(mapWig)
    }
    message("Correcting Tumour")
    counts <- loadReadCountsFromWig(tumour_reads, chrs = chrs, gc = gc, map = map,
                                    centromere = centromere, flankLength = flankLength,
                                    targetedSequences = targetedSequences,
                                    chrXMedianForMale = chrXMedianForMale,
                                    genomeStyle = genomeStyle,
                                    fracReadsInChrYForMale = fracReadsInChrYForMale,
                                    chrNormalize = chrNormalize, mapScoreThres = minMapScore)
    tumour_copy[[id]] <- counts$counts #as(counts$counts, "GRanges")
    gender <- counts$gender
    ## load in normal file if provided
    if (!is.null(normal_file) && normal_file != "None" && normal_file != "NULL"){
        message("Loading normal file:", normal_file)
        normal_reads <- wigToGRanges(normal_file)
        message("Correcting Normal")
        counts <- loadReadCountsFromWig(normal_reads, chrs=chrs, gc=gc, map=map,
                                        centromere=centromere, flankLength = flankLength,
                                        targetedSequences=targetedSequences,
                                        genomeStyle = genomeStyle,
                                        chrNormalize = chrNormalize,
                                        mapScoreThres = minMapScore)
        normal_copy <- counts$counts #as(counts$counts, "GRanges")
        gender.normal <- counts$gender
    }else{
        normal_copy <- NULL
    }
    ### DETERMINE GENDER ###
    ## if normal file not given, use chrY, else use chrX
    message("Determining gender...", appendLF = FALSE)
    gender.mismatch <- FALSE
    if (!is.null(normal_copy)){
        if (gender$gender != gender.normal$gender){
            # use tumour; use normal if given
            # check if normal is same gender as tumour
            gender.mismatch <- TRUE
        }
    }
    message("Gender ", gender$gender)
    ## NORMALIZE GENOME-WIDE BY MATCHED NORMAL OR NORMAL PANEL (MEDIAN) ##
    tumour_copy[[id]] <- normalizeByPanelOrMatchedNormal(tumour_copy[[id]], chrs = chrs,
                                                         normal_panel = normal_panel,
                                                         normal_copy = normal_copy,
                                                         gender = gender$gender,
                                                         normalizeMaleX = normalizeMaleX)
    ### OUTPUT FILE ###
    ### PUTTING TOGETHER THE COLUMNS IN THE OUTPUT ###
    outMat <- as.data.frame(tumour_copy[[id]])
    #outMat <- outMat[,c(1,2,3,12)]
    outMat <- outMat[,c("seqnames","start","end","copy")]
    colnames(outMat) <- c("chr","start","end","log2_TNratio_corrected")
    outFile <- paste0(outDir,"/",id,".correctedDepth.txt")
    message(paste("Outputting to:", outFile))
    write.table(outMat, file=outFile, row.names=F, col.names=T, quote=F, sep="\t")
} ## end of for each sample

chrInd <- as.character(seqnames(tumour_copy[[1]])) %in% chrTrain
## get positions that are valid
valid <- tumour_copy[[1]]$valid
if (length(tumour_copy) >= 2) {
    for (i in 2:length(tumour_copy)){
        valid <- valid & tumour_copy[[i]]$valid
    }
}
save.image(outImage)

### RUN HMM ###
## store the results for different normal and ploidy solutions ##
ptmTotalSolutions <- proc.time() # start total timer
results <- list()
loglik <- as.data.frame(matrix(NA, nrow = length(normal) * length(ploidy), ncol = 7,
                               dimnames = list(c(), c("init", "n_est", "phi_est", "BIC",
                                                      "Frac_genome_subclonal", "Frac_CNA_subclonal", "loglik"))))
counter <- 1
compNames <- rep(NA, nrow(loglik))
mainName <- rep(NA, length(normal) * length(ploidy))
#### restart for purity and ploidy values ####
for (n in normal){
    for (p in ploidy){
        if (n == 0.95 & p != 2) { next }
        logR <- as.data.frame(lapply(tumour_copy, function(x) { x$copy })) # NEED TO EXCLUDE CHR X #
        param <-
getDefaultParameters(logR[valid & chrInd, , drop=F], maxCN = maxCN, includeHOMD = includeHOMD, ct.sc=scStates, ploidy = floor(p), e=txnE, e.same = 50, strength=txnStrength) param$phi_0 <- rep(p, numSamples) param$n_0 <- rep(n, numSamples) ############################################ ######## CUSTOM PARAMETER SETTINGS ######### ############################################ # 0.1x cfDNA # if (is.null(lambda)){ logR.var <- 1 / ((apply(logR, 2, sd, na.rm = TRUE) / sqrt(length(param$ct))) ^ 2) param$lambda <- rep(logR.var, length(param$ct)) param$lambda[param$ct %in% c(2)] <- logR.var param$lambda[param$ct %in% c(1,3)] <- logR.var param$lambda[param$ct >= 4] <- logR.var / 5 param$lambda[param$ct == max(param$ct)] <- logR.var / 15 param$lambda[param$ct.sc.status] <- logR.var / 10 }else{ param$lambda[param$ct %in% c(2)] <- lambda[2] param$lambda[param$ct %in% c(1)] <- lambda[1] param$lambda[param$ct %in% c(3)] <- lambda[3] param$lambda[param$ct >= 4] <- lambda[4] param$lambda[param$ct == max(param$ct)] <- lambda[2] / 15 param$lambda[param$ct.sc.status] <- lambda[2] / 10 } param$alphaLambda <- rep(lambdaScaleHyperParam, length(param$ct)) # 1x bulk tumors # #param$lambda[param$ct %in% c(2)] <- 2000 #param$lambda[param$ct %in% c(1)] <- 1750 #param$lambda[param$ct %in% c(3)] <- 1750 #param$lambda[param$ct >= 4] <- 1500 #param$lambda[param$ct == max(param$ct)] <- 1000 / 25 #param$lambda[param$ct.sc.status] <- 1000 / 75 #param$alphaLambda[param$ct.sc.status] <- 4 #param$alphaLambda[param$ct %in% c(1,3)] <- 5 #param$alphaLambda[param$ct %in% c(2)] <- 5 #param$alphaLambda[param$ct == max(param$ct)] <- 4 ############################################# ################ RUN HMM #################### ############################################# hmmResults.cor <- HMMsegment(tumour_copy, valid, dataType = "copy", param = param, chrTrain = chrTrain, maxiter = 50, estimateNormal = estimateNormal, estimatePloidy = estimatePloidy, estimateSubclone = estimateScPrevalence, verbose = TRUE) for (s in 1:numSamples){ iter <- hmmResults.cor$results$iter id <- names(hmmResults.cor$cna)[s] ## convert full diploid solution (of chrs to train) to have 1.0 normal or 0.0 purity ## check if there is an altered segment that has at least a minimum # of bins segsS <- hmmResults.cor$results$segs[[s]] segsS <- segsS[segsS$chr %in% chrTrain, ] segAltInd <- which(segsS$event != "NEUT") maxBinLength = -Inf if (sum(segAltInd) > 0){ maxInd <- which.max(segsS$end[segAltInd] - segsS$start[segAltInd] + 1) maxSegRD <- GRanges(seqnames=segsS$chr[segAltInd[maxInd]], ranges=IRanges(start=segsS$start[segAltInd[maxInd]], end=segsS$end[segAltInd[maxInd]])) hits <- findOverlaps(query=maxSegRD, subject=tumour_copy[[s]][valid, ]) maxBinLength <- length(subjectHits(hits)) } ## check if there are proportion of total bins altered # if segment size smaller than minSegmentBins, but altFrac > altFracThreshold, then still estimate TF cnaS <- hmmResults.cor$cna[[s]] altInd <- cnaS[cnaS$chr %in% chrTrain, "event"] == "NEUT" altFrac <- sum(!altInd, na.rm=TRUE) / length(altInd) if ((maxBinLength <= minSegmentBins) & (altFrac <= altFracThreshold)){ hmmResults.cor$results$n[s, iter] <- 1.0 } # correct integer copy number based on estimated purity and ploidy correctedResults <- correctIntegerCN(cn = hmmResults.cor$cna[[s]], segs = hmmResults.cor$results$segs[[s]], purity = 1 - hmmResults.cor$results$n[s, iter], ploidy = hmmResults.cor$results$phi[s, iter], cellPrev = 1 - hmmResults.cor$results$sp[s, iter], maxCNtoCorrect.autosomes = maxCN, maxCNtoCorrect.X = maxCN, 
minPurityToCorrect = minTumFracToCorrect, gender = gender$gender, chrs = chrs, correctHOMD = includeHOMD) hmmResults.cor$results$segs[[s]] <- correctedResults$segs hmmResults.cor$cna[[s]] <- correctedResults$cn ## plot solution ## outPlotFile <- paste0(outDir, "/", id, "/", id, "_genomeWide_", "n", n, "-p", p) mainName[counter] <- paste0(id, ", n: ", n, ", p: ", p, ", log likelihood: ", signif(hmmResults.cor$results$loglik[hmmResults.cor$results$iter], digits = 4)) plotGWSolution(hmmResults.cor, s=s, outPlotFile=outPlotFile, plotFileType=plotFileType, logR.column = "logR", call.column = "Corrected_Call", plotYLim=plotYLim, estimateScPrevalence=estimateScPrevalence, seqinfo=seqinfo, main=mainName[counter]) } iter <- hmmResults.cor$results$iter results[[counter]] <- hmmResults.cor loglik[counter, "loglik"] <- signif(hmmResults.cor$results$loglik[iter], digits = 4) subClonalBinCount <- unlist(lapply(hmmResults.cor$cna, function(x){ sum(x$subclone.status) })) fracGenomeSub <- subClonalBinCount / unlist(lapply(hmmResults.cor$cna, function(x){ nrow(x) })) fracAltSub <- subClonalBinCount / unlist(lapply(hmmResults.cor$cna, function(x){ sum(x$copy.number != 2) })) fracAltSub <- lapply(fracAltSub, function(x){if (is.na(x)){0}else{x}}) loglik[counter, "Frac_genome_subclonal"] <- paste0(signif(fracGenomeSub, digits=2), collapse=",") loglik[counter, "Frac_CNA_subclonal"] <- paste0(signif(as.numeric(fracAltSub), digits=2), collapse=",") loglik[counter, "init"] <- paste0("n", n, "-p", p) loglik[counter, "n_est"] <- paste(signif(hmmResults.cor$results$n[, iter], digits = 2), collapse = ",") loglik[counter, "phi_est"] <- paste(signif(hmmResults.cor$results$phi[, iter], digits = 4), collapse = ",") counter <- counter + 1 } } ## get total time for all solutions ## elapsedTimeSolutions <- proc.time() - ptmTotalSolutions message("Total ULP-WGS HMM Runtime: ", format(elapsedTimeSolutions[3] / 60, digits = 2), " min.") ### SAVE R IMAGE ### save.image(outImage) #save(tumour_copy, results, loglik, file=paste0(outDir,"/",id,".RData")) ### SELECT SOLUTION WITH LARGEST LIKELIHOOD ### loglik <- loglik[!is.na(loglik$init), ] if (estimateScPrevalence){ ## sort but excluding solutions with too large % subclonal fracInd <- which(loglik[, "Frac_CNA_subclonal"] <= maxFracCNASubclone & loglik[, "Frac_genome_subclonal"] <= maxFracGenomeSubclone) if (length(fracInd) > 0){ ## if there is a solution satisfying % subclonal ind <- fracInd[order(loglik[fracInd, "loglik"], decreasing=TRUE)] }else{ # otherwise just take largest likelihood ind <- order(as.numeric(loglik[, "loglik"]), decreasing=TRUE) } }else{#sort by likelihood only ind <- order(as.numeric(loglik[, "loglik"]), decreasing=TRUE) } #new loop by order of solutions (ind) outPlotFile <- paste0(outDir, "/", id, "/", id, "_genomeWide_all_sols") for(i in 1:length(ind)) { hmmResults.cor <- results[[ind[i]]] turnDevOff <- FALSE turnDevOn <- FALSE if (i == 1){ turnDevOn <- TRUE } if (i == length(ind)){ turnDevOff <- TRUE } plotGWSolution(hmmResults.cor, s=s, outPlotFile=outPlotFile, plotFileType="pdf", logR.column = "logR", call.column = "Corrected_Call", plotYLim=plotYLim, estimateScPrevalence=estimateScPrevalence, seqinfo = seqinfo, turnDevOn = turnDevOn, turnDevOff = turnDevOff, main=mainName[ind[i]]) } hmmResults.cor <- results[[ind[1]]] hmmResults.cor$results$loglik <- as.data.frame(loglik) hmmResults.cor$results$gender <- gender$gender hmmResults.cor$results$chrYCov <- gender$chrYCovRatio hmmResults.cor$results$chrXMedian <- gender$chrXMedian 
hmmResults.cor$results$coverage <- coverage outputHMM(cna = hmmResults.cor$cna, segs = hmmResults.cor$results$segs, results = hmmResults.cor$results, patientID = patientID, outDir=outDir) outFile <- paste0(outDir, "/", patientID, ".params.txt") outputParametersToFile(hmmResults.cor, file = outFile)
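For orientation, this is roughly how the vendored script gets invoked against the chr8 test references built later in this file. A minimal sketch, assuming a 1 Mb readCounter wig exists for the library; the WIG path and output directory are illustrative, while the script path matches the ichor_lowfract call further down:
Rscript /opt/ichorCNA/scripts/runIchorCNA.R \
    --id lib005 \
    --WIG test/analysis/wigs/lib005.wig \
    --gcWig test/inputs/gc_chr8_1000kb.wig \
    --mapWig test/inputs/map_chr8_1000kb.wig \
    --genomeBuild hg38 \
    --genomeStyle UCSC \
    --chrs "c(8)" \
    --chrTrain "c(8)" \
    --estimateScPrevalence FALSE \
    --scStates "c()" \
    --outDir test/analysis/ichor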
- Snakemake
# Filter fragments by length
rule cna_frag_filt_tmp:
    benchmark:
        benchdir + "/{library}_{frag_distro}_cfdna_wgs_frag_filt.benchmark.txt",
    container:
        cfdna_wgs_container,
    input:
        cfdna_wgs_cna_in_bams + "/{library}.bam",
    log:
        logdir + "/{library}_{frag_distro}_cfdna_wgs_frag_filt.log",
    output:
        # temp() wraps the full path, not just the directory variable
        nohead = temp(cfdna_wgs_cna_frag_bams + "/{library}_frag{frag_distro}.nohead"),
        onlyhead = temp(cfdna_wgs_cna_frag_bams + "/{library}_frag{frag_distro}.only"),
        final = cfdna_wgs_cna_frag_bams + "/{library}_frag{frag_distro}.bam",
    params:
        script = cfdna_wgs_scriptdir + "/frag_filt.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        frag_min=$(echo {wildcards.frag_distro} | sed -e "s/_.*$//g")
        frag_max=$(echo {wildcards.frag_distro} | sed -e "s/^.*_//g")
        {params.script} \
        {input} \
        {output.nohead} \
        $frag_min \
        $frag_max \
        {config[threads]} \
        {output.onlyhead} \
        {output.final}
        """
Note: this would require a rewrite of ds_cond_target_list.
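The rule passes seven positional arguments to frag_filt.sh. A minimal sketch of what such a script could look like, assuming a TLEN-based awk filter over the SAM body; this is an illustration matching the argument order above, not necessarily the repo's actual script:
#!/usr/bin/env bash
# Hypothetical frag_filt.sh: split header from body, filter by insert size, reassemble
in_bam=$1
nohead=$2
frag_min=$3
frag_max=$4
threads=$5
onlyhead=$6
final=$7

# Keep alignments whose absolute insert size (TLEN, field 9) falls in [frag_min, frag_max]
samtools view -@ $threads $in_bam \
    | awk -v min=$frag_min -v max=$frag_max 'sqrt($9*$9) >= min && sqrt($9*$9) <= max' > $nohead

# Pull the header, then reassemble a valid BAM
samtools view -H $in_bam > $onlyhead
cat $onlyhead $nohead | samtools view -@ $threads -b - > $final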
- Make a smaller fasta for indexing
#!/bin/echo For documentation, not intended to be executable:.
singularity shell ~/sing_containers/biotools.1.0.2.sif
repo=/home/jeszyman/repos/cfdna-wgs
wget --directory-prefix="${repo}/test/inputs/" https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz

# Crude extraction: grabs up to 1M lines after the chr8 header line
zcat "test/inputs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz" | grep -A 1000000 chr8 | gzip > test/inputs/chr8.fa.gz

# Test indexed size
mkdir -p /tmp/testbwa
bwa index -p /tmp/testbwa/chr8 test/inputs/chr8.fa.gz
rm ${repo}/test/inputs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
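The grep -A 1000000 trick can run past the end of chr8 into the next sequence. A safer sketch, assuming samtools is available in the biotools container, pulls chr8 by name:
# Safer chr8 extraction via faidx (name-based, cannot overrun into chr9)
gunzip -k test/inputs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
samtools faidx test/inputs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna
samtools faidx test/inputs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna chr8 | gzip > test/inputs/chr8.fa.gz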
- Make chr8-specific ichor references
singularity shell ~/sing_containers/cfdna_wgs.1.0.0.sif
~/wigToBigWig -clip /opt/ichorCNA/inst/extdata/gc_hg38_1000kb.wig test/inputs/hg38.chrom.sizes test/inputs/gc_chr8_1000kb.bw
bigWigToWig -chrom=chr8 test/inputs/gc_chr8_1000kb.bw test/inputs/gc_chr8_1000kb.wig
~/wigToBigWig -clip /opt/ichorCNA/inst/extdata/map_hg38_1000kb.wig test/inputs/hg38.chrom.sizes test/inputs/map_chr8_1000kb.bw
bigWigToWig -chrom=chr8 test/inputs/map_chr8_1000kb.bw test/inputs/map_chr8_1000kb.wig
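A quick sanity check on the resulting references, assuming the fixedStep wig layout survives the bigWig round trip:
# Each wig should declare chrom=chr8 with a 1 Mb step
grep fixedStep test/inputs/gc_chr8_1000kb.wig | head -2
grep fixedStep test/inputs/map_chr8_1000kb.wig | head -2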
- Get hg38 chromosome sizes and the ENCODE blacklist
wget --directory-prefix="/home/jeszyman/repos/cfdna-wgs/test/inputs" https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.chrom.sizes
wget --directory-prefix="/home/jeszyman/repos/cfdna-wgs/test/inputs" https://raw.githubusercontent.com/Boyle-Lab/Blacklist/master/lists/hg38-blacklist.v2.bed.gz
gunzip -c ~/repos/cfdna-wgs/test/inputs/hg38-blacklist.v2.bed.gz > ~/repos/cfdna-wgs/test/inputs/hg38-blacklist.v2.bed
singularity shell --bind /mnt ~/sing_containers/cfdna_wgs.1.0.0.sif

# Clear bam directory if present
if [ -r test/bam ]; then \rm -rf test/bam; fi
mkdir -p test/bam

# Create small bam files to store in repo. Subsample real bams to ~100 Mb.
sambamba view -s .005 -f bam -t 36 /mnt/ris/aadel/Active/mpnst/test/bam/new_HiSeq15_L002001_ACAC_extract_ds20.bam > test/inputs/lib003_hg38.bam
sambamba view -s .005 -f bam -t 36 /mnt/ris/aadel/Active/mpnst/test/bam/new_HiSeq15_L002001_ATCG_extract_ds20.bam > test/inputs/lib004_hg38.bam
sambamba view -s 0.01 -f bam -t 4 /mnt/ris/aadel/Active/mpnst/bam/cfdna_wgs/ds/lib105_ds10.bam > test/inputs/lib005.bam
sambamba view -s 0.01 -f bam -t 4 /mnt/ris/aadel/Active/mpnst/bam/cfdna_wgs/ds/lib205_ds10.bam > test/inputs/lib006.bam
for file in test/inputs/*.bam; do samtools index $file; done
mkdir -p ~/repos/cfdna-wgs/test/analysis/cfdna_frag_bams
cp ~/repos/cfdna-frag/test/bam/frag/*.bam ~/repos/cfdna-wgs/test/analysis/cfdna_frag_bams/
singularity shell --bind /mnt ~/sing_containers/cfdna_wgs.1.0.0.sif

# Get hg38 gc bigwig
wget --directory-prefix /tmp/ http://hgdownload.cse.ucsc.edu/gbdb/hg38/bbi/gc5BaseBw/gc5Base.bw

# Convert hg38 gc bigwig to tsv binned at 5 Mb (like Mathios, 2021)
multiBigwigSummary bins \
    --binSize 5000000 \
    --bwfiles /tmp/gc5Base.bw \
    --numberOfProcessors 4 \
    --outFileName /tmp/test.out \
    --outRawCounts /tmp/gc5mb.tsv
tail -n +2 /tmp/gc5mb.tsv > test/inputs/gc5mb.bed
- bedtools subtract -a "test/inputs/chr8.bed" -b "test/inputs/hg38-blacklist.v2.bed" > "test/inputs/keep.bed"
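chr8.bed is not created elsewhere in this file; one way to derive it from the chrom.sizes download above:
# Build a whole-chromosome bed for chr8 from hg38.chrom.sizes
awk -v OFS='\t' '$1 == "chr8" {print $1, 0, $2}' test/inputs/hg38.chrom.sizes > test/inputs/chr8.bed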
- Find reads that map to the start of chr8 and work them back into a fastq (sketch below)
- cite:dehner2021
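A minimal sketch of that round trip, assuming an indexed, coordinate-sorted input BAM; the region and output paths are illustrative:
# Extract pairs mapping to the start of chr8, then regenerate fastqs.
# samtools fastq needs name-collated input, hence the sort -n.
samtools view -b test/inputs/lib005.bam chr8:1-10000000 \
    | samtools sort -n - \
    | samtools fastq -1 /tmp/chr8_R1.fastq -2 /tmp/chr8_R2.fastq -0 /dev/null -s /dev/null -n -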
- using mosdepth
#########1#########2#########3#########4#########5#########6#########7#########8
#
### mosdepth for WGS depth calc ###
#
# Setup
##
# Mosdepth per bam dir
##
## For deduped bams (mosdepth_mpnst is a wrapper defined elsewhere)
for file in $localdata/bams/*.dedup.sorted.bam; do
    mosdepth_mpnst $file $localdata/bam-qc/dedup 250000000
done
##
#
# Get simple tsv and send to repo
for file in $localdata/bam-qc/dedup/lib*.regions.bed.gz; do
    base=`basename -s .dedup.sorted.regions.bed.gz $file`
    zcat $file | awk -v FS='\t' -v var=$base 'NR <= 24 {print var,$1,$4}' >> $localdata/bam-qc/dedup/all_dedup_coverage
done
header=library_id\\tchr\\tmean_coverage
sed -i "1 i$header" $localdata/bam-qc/dedup/all_dedup_coverage
##
## Local
source ~/repos/mpnst/bin/local-setup.sh
docker_interactive biotools
##
## Functions
###
### Convert bams to wigs
bam_to_wig() {
    printf "Variables are: 1=bam_file 2=bam_suffix 3=outdir\n"
    base=`basename -s ${2} $1`
    if [ $3/${base}.wig -ot $1 ]; then
        /opt/hmmcopy_utils/bin/readCounter --window 1000000 --quality 20 \
            --chromosome "chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chrX,chrY" \
            $1 > $3/${base}.wig
    fi
}
###
### Run ichor for low TF
ichor_lowfract() {
    base=`basename -s .wig $1`
    if [ $2/$base.RData -ot $1 ]; then
        Rscript /opt/ichorCNA/scripts/runIchorCNA.R \
            --id $base \
            --WIG $1 \
            --gcWig /opt/ichorCNA/inst/extdata/gc_hg19_1000kb.wig \
            --normal "c(0.95, 0.99, 0.995, 0.999)" \
            --ploidy "c(2)" \
            --maxCN 3 \
            --estimateScPrevalence FALSE \
            --scStates "c()" \
            --outDir $2
    fi
}
##
##
mkdir -p $localdata/wigs
mkdir -p $localdata/ichor
#
# Make wigs
#
#bam_to_wig /mnt/xt3/mpnst/frag-filt-bams/lib109.dedup.sorted.frag90_150.sorted.bam .dedup.sorted.frag90_150.sorted.bam $localdata/wigs
##
for file in $localdata/frag-filt-bams/lib109*.bam; do
    bam_to_wig $file \
        .dedup.sorted.frag.sorted.bam \
        $localdata/wigs
done
## For fraction-filtered WGS cfDNA
for file in $localdata/frag-filt-bams/*.bam; do
    bam_to_wig $file \
        .dedup.sorted.frag.sorted.bam \
        $localdata/wigs
done
##
## For tumor and leukocyte WGS libraries
### Make array of genomic library file paths
genomic=($(cat /drive3/users/jszymanski/repos/mpnst/data/libraries.csv | grep -e tumor -e leukocyte | grep -v "wes" | awk -F, '{print $1}' | sed 's/"//g' | sed 's/$/.dedup.sorted.bam/g' | sed 's/^/\/mnt\/xt3\/mpnst\/bams\//g'))
###
for file in ${genomic[@]}; do
    bam_to_wig $file \
        .dedup.sorted.bam \
        $localdata/wigs
done
#
##
## Send successful file list to repo
rm /drive3/users/jszymanski/repos/mpnst/data/wigs.tsv
for file in $localdata/wigs/*.wig; do
    base=`basename -s .wig $file`
    echo $base >> /drive3/users/jszymanski/repos/mpnst/data/wigs.tsv
done
#
## RESUME HERE
# ichor
##
for file in $localdata/wigs/lib109*.wig; do
    ichor_lowfract $file $localdata/ichor
done

# Guard against committing an oversized coverage table
max_file_size=5000000
file_size=$( wc -c <"$localdata/bam-qc/dedup/all_dedup_coverage" )
if [ $file_size -gt $max_file_size ]; then
    touch $repo/data/qc/all_dedup_coverage_too_big
else
    cp $localdata/bam-qc/dedup/all_dedup_coverage $repo/qc/all_dedup_coverage.tsv
fi
#
- Can't calculate depths from ~/repos/mpnst/data/bam_qc_data/mqc_mosdepth-coverage-per-contig_1.txt; it does not allow values under 1
- [ ] for coverage, should intersect down to autosomes
- https://github.com/brentp/mosdepth
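For reference, a bare-bones mosdepth call in the spirit of the mosdepth_mpnst wrapper above; the prefix, window size, and paths are illustrative:
# Fast whole-genome depth in 500 kb windows, skipping per-base output
mosdepth --no-per-base --by 500000 --threads 4 \
    test/analysis/qc/lib005 test/inputs/lib005.bam
# Outputs land at <prefix>.mosdepth.summary.txt and <prefix>.regions.bed.gz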
- Run and extract mosdepth: mosdepthRAW = as_tibble(read.table(file.path(repo, "data/all_dedup_coverage.tsv"), header = T, sep = '\t', fill = TRUE))
- Snakemake
# Downsample bam file to a set number of reads
rule cfdna_wgs_downsample:
    benchmark:
        benchdir + "/{library}_{milreads}_cfdna_wgs_downsample.benchmark.txt",
    container:
        cfdna_wgs_container,
    input:
        cfdna_wgs_bams + "/{library}_filt.bam",
    log:
        logdir + "/{library}_{milreads}_cfdna_wgs_downsample.log",
    output:
        cfdna_wgs_bams + "/{library}_ds{milreads}.bam",
    params:
        milreads = MILREADS,
        script = cfdna_wgs_scriptdir + "/downsample.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {input} \
        {wildcards.milreads} \
        {output} &> {log}
        """
- Shell script
#!/usr/bin/env bash

# For unit testing
# in_bam="test/analysis/cfdna_wgs_bams/lib001_filt.bam"
# out_bam=/tmp/test.bam
# milreads="0.0041"

in_bam=$1
milreads="$2"
out_bam=$3

reads=$(echo | awk -v var1=$milreads '{ print 1000000*var1 }')

## Calculate the sampling factor based on the intended number of reads:
FACTOR=$(samtools idxstats $in_bam | cut -f3 | awk -v COUNT=$reads 'BEGIN {total=0} {total += $1} END {print COUNT/total}')

# NB: [[ $FACTOR > 1 ]] would compare strings; use awk for a numeric test on the float
if [ $(echo $FACTOR | awk '{print ($1 > 1)}') -eq 1 ]; then
    echo "DS reads exceeds total for $in_bam"
else
    sambamba view -s $FACTOR -f bam -l 5 $in_bam > $out_bam
fi
# Alignment deduplication and sorting
rule alignment_processing:
input:
config["datadir"] + "/bam/{library_id}_raw.bam",
    output:
        # declared so the shell block's {output.bam} resolves; the exact name is an assumption
        bam = temp(config["datadir"] + "/bam/{library_id}_conv.bam"),
        dedup = temp(config["datadir"] + "/bam/{library_id}_dedup_unsort.bam"),
        sort = config["datadir"] + "/bam/{library_id}_dedup.bam",
        index = config["datadir"] + "/bam/{library_id}_dedup.bam.bai",
log:
config["datadir"] + "/logs/alignment_processing_{library_id}.log"
shell:
"""
{config[cfdna_wgs_script_dir]}/alignment_processing.sh \
{input} \
{config[threads]} \
{output.bam} \
{output.dedup} \
{output.sort} \
{output.index} \
&> {log}
"""
- Script
#!/usr/bin/env bash

<#bash_preamble#>

input=$1
threads=$2
output_bam=$3
output_dedup=$4
output_sort=$5
output_index=$6

# Convert alignment to bam, remove duplicates, then coordinate-sort and index.
# NB: -S treats the input as SAM; drop it if {library_id}_raw.bam is already a true BAM.
sambamba view -t $threads -S -f bam $input > $output_bam
sambamba markdup -r -t $threads $output_bam $output_dedup
sambamba sort -t $threads $output_dedup -o $output_sort
sambamba index -t $threads $output_sort $output_index
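A quick post-run sanity check, with an illustrative library name:
# Confirm duplicates were removed and the index was written
samtools flagstat test/bam/lib001_dedup.bam
ls -lh test/bam/lib001_dedup.bam.bai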