- test/inputs/libraries.tsv
- test/inputs/cna_libraries.tsv
- Get fastq inputs for read processing and testing
#!/bin/echo For documentation, not intended to be executable:
mntpt=/mnt/ris/aadel/Active
singularity shell --bind ${mntpt} ~/sing_containers/biotools.1.0.2.sif

repo=/home/jeszyman/repos/cfdna-wgs
mntpt=/mnt/ris/aadel/Active
mkdir -p ${repo}/test/inputs
fq_size=100000

zcat ${mntpt}/mpnst/inputs/seq/MPNST/19_2_082_R1.fastq.gz | head -n $fq_size > ${repo}/test/inputs/mpnst1_R1.fastq
zcat ${mntpt}/mpnst/inputs/seq/MPNST/19_2_082_R2.fastq.gz | head -n $fq_size > ${repo}/test/inputs/mpnst1_R2.fastq
zcat ${mntpt}/mpnst/inputs/seq/MPNST/25_2_072_R1.fastq.gz | head -n $fq_size > ${repo}/test/inputs/mpnst2_R1.fastq
zcat ${mntpt}/mpnst/inputs/seq/MPNST/25_2_072_R2.fastq.gz | head -n $fq_size > ${repo}/test/inputs/mpnst2_R2.fastq
zcat ${mntpt}/mpnst/inputs/seq/PN/37_JS0050CD112717_R1.fastq.gz | head -n $fq_size > ${repo}/test/inputs/plex1_R1.fastq
zcat ${mntpt}/mpnst/inputs/seq/PN/37_JS0050CD112717_R2.fastq.gz | head -n $fq_size > ${repo}/test/inputs/plex1_R2.fastq
zcat ${mntpt}/mpnst/inputs/seq/PN/30_JS0044CD112818_R1.fastq.gz | head -n $fq_size > ${repo}/test/inputs/plex2_R1.fastq
zcat ${mntpt}/mpnst/inputs/seq/PN/30_JS0044CD112818_R2.fastq.gz | head -n $fq_size > ${repo}/test/inputs/plex2_R2.fastq

for file in ${repo}/test/inputs/*.fastq; do gzip -f $file; done
- Get bams for CNA and frag testing
mntpt=/mnt/ris/aadel/Active
singularity shell --bind ${mntpt} ~/sing_containers/biotools.1.0.2.sif

repo=/home/jeszyman/repos/cfdna-wgs
mntpt=/mnt/ris/aadel/Active
mkdir -p ${repo}/test/inputs

# Create small bam files to store in repo. Subsample real bams to ~100 Mb.
sambamba view -s .005 -f bam -t 36 /mnt/ris/aadel/Active/mpnst/test/bam/new_HiSeq15_L002001_ACAC_extract_ds20.bam |
    samtools sort -@4 -o test/inputs/lib003_hg38.bam -
sambamba view -s .005 -f bam -t 36 /mnt/ris/aadel/Active/mpnst/test/bam/new_HiSeq15_L002001_ATCG_extract_ds20.bam > test/inputs/lib004_hg38.bam
sambamba view -s 0.01 -f bam -t 4 /mnt/ris/aadel/Active/mpnst/bam/cfdna_wgs/ds/lib105_ds10.bam > test/inputs/lib005.bam
sambamba view -s 0.01 -f bam -t 4 /mnt/ris/aadel/Active/mpnst/bam/cfdna_wgs/ds/lib205_ds10.bam > test/inputs/lib006.bam

for file in test/inputs/*.bam; do samtools index $file; done
FROM jeszyman/biotools:1.0.2
#################
### Notes ###
#################
#
# After build, the image will be pushed to the dockerhub as
# jeszyman/cfdna_wgs
# (https://hub.docker.com/repository/docker/jeszyman/cfdna_wgs)
#
RUN cd /opt && \
git clone https://github.com/shahcompbio/hmmcopy_utils.git && \
cd hmmcopy_utils && \
cmake . && \
make
#
# ichorCNA
##
## linux dependencies
RUN apt-get update \
&& apt-get install -y \
libcurl4-openssl-dev \
libssl-dev \
libxml2-dev
#RUN rm /usr/lib/x86_64-linux-gnu/libcurl.so.4
#RUN ln -s /usr/lib/x86_64-linux-gnu/libcurl.so.4.5.0 /usr/lib/x86_64-linux-gnu/libcurl.so.4
##
## R dependencies
RUN R -e 'install.packages("BiocManager"); BiocManager::install(); BiocManager::install("HMMcopy"); BiocManager::install("GenomeInfoDb"); BiocManager::install("GenomicRanges");'
##
## git clone install
RUN cd /opt \
&& git clone https://github.com/broadinstitute/ichorCNA.git \
&& cd ichorCNA \
&& R CMD INSTALL . \
&& cd /opt
##
### Parameters intended to be common across workflows ###
blklist: "test/inputs/hg38-blacklist.v2.bed"
datadir: "test"
default_container: "/home/jeszyman/sing_containers/biotools.1.0.2.sif"
genome_fasta: "test/inputs/chr8.fa"
genome_ref: "test/ref/chr8.fa"
qcdir: "test/analysis/qc"
threads: 4
### Unique properties from this repo ###
cfdna_wgs_container: "/home/jeszyman/sing_containers/cfdna_wgs.1.0.0.sif"
cfdna_wgs_repo: "/home/jeszyman/repos/cfdna-wgs"
downsample:
- "0.001"
- "0.004"
frag_distro: "90_150"
gc5mb: "test/inputs/gc5mb.bed"
picard_jar: "/opt/picard/picard.jar"
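For orientation, here is a minimal sketch of the config dictionary the YAML above produces. Snakemake parses --configfile itself; PyYAML is used here only to illustrate the resulting structure, and the config/int_test.yaml path follows the repo layout described below.

import yaml

# Assumes the YAML above is saved as config/int_test.yaml.
with open("config/int_test.yaml") as f:
    config = yaml.safe_load(f)

datadir = config["datadir"]                        # "test"
cfdna_wgs_bams = datadir + "/analysis/cfdna_wgs/bams"
assert config["downsample"] == ["0.001", "0.004"]  # quoted, so strings not floats
assert config["threads"] == 4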
INPROCESS Integration testing snakefile wrapper
##################################################################
### Integration testing snakefile for WGS cfDNA Processing ###
##################################################################
import os
import pandas as pd
import re
import numpy as np
# Values directly from configuration file
DOWNSAMPLE = config["downsample"]
threads = config["threads"]
FRAG_DISTROS = config["frag_distro"]
cfdna_wgs_threads = config["threads"]
default_container = config["default_container"]
cfdna_wgs_container = config["cfdna_wgs_container"]
genome_fasta = config["genome_fasta"]
genome_ref = config["genome_ref"]
cfdna_wgs_repo = config["cfdna_wgs_repo"]
# Directory values derived from datadir in configuration YAML
datadir = config["datadir"]
cfdna_wgs = datadir + "/analysis/cfdna_wgs"
cfdna_wgs_bams = datadir + "/analysis/cfdna_wgs/bams"
cfdna_wgs_fastqs = datadir + "/analysis/cfdna_wgs/fastqs"
cfdna_wgs_frag = datadir + "/analysis/cfdna_wgs/frag"
cfdna_wgs_frag_beds = datadir + "/analysis/cfdna_wgs/frag/beds"
cfdna_wgs_frag_counts = datadir + "/analysis/cfdna_wgs/frag/counts"
cfdna_wgs_frag_gc_distros = datadir + "/analysis/cfdna_wgs/frag/distros"
qcdir = datadir + "/analysis/qc"
benchdir = datadir + "/benchmark"
logdir = datadir + "/logs"
refdir = datadir + "/ref"
cfdna_wgs_scriptdir = config["cfdna_wgs_repo"] + "/scripts"
### Functions ###
# Set up the sample name index as a python dictionary
cfdna_wgs_libraries = pd.read_table(config["datadir"] + "/inputs/libraries.tsv")
readable = []
for x in cfdna_wgs_libraries.file:
readable.append(os.access(x, os.R_OK))
# Ensure readable fastqs
cfdna_wgs_libraries['readable']=readable
cfdna_wgs_libraries = cfdna_wgs_libraries[cfdna_wgs_libraries.readable == True]
# Ensure correct library type per sample sheet
cfdna_wgs_libraries = cfdna_wgs_libraries[cfdna_wgs_libraries.library_type == "wgs"]
cfdna_wgs_libraries = cfdna_wgs_libraries[cfdna_wgs_libraries.isolation_type == "cfdna"]
# Make the dictionary
cfdna_wgs_library_indict = cfdna_wgs_libraries["library"].tolist()
cfdna_wgs_file_indict = cfdna_wgs_libraries["file"].tolist()
cfdna_wgs_lib_dict = dict(zip(cfdna_wgs_library_indict, cfdna_wgs_file_indict))
CFDNA_WGS_LIBRARIES = list(cfdna_wgs_lib_dict.keys())
CFDNA_WGS_FASTQS = list(cfdna_wgs_lib_dict.values())
# Make a list of healthy libraries
CFDNA_WGS_HEALTHY_LIBRARIES = cfdna_wgs_libraries[cfdna_wgs_libraries['cohort'] == 'healthy']['library'].tolist()
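To make the dictionary construction above concrete, a toy example (the library names, paths, and cohorts below are hypothetical, not the repository's actual test sheet):

import pandas as pd

libraries = pd.DataFrame({
    "library":        ["lib001", "lib002", "lib003"],
    "file":           ["test/inputs/a_R1.fastq.gz",
                       "test/inputs/b_R1.fastq.gz",
                       "test/inputs/c_R1.fastq.gz"],
    "library_type":   ["wgs", "wgs", "panel"],
    "isolation_type": ["cfdna", "cfdna", "cfdna"],
    "cohort":         ["healthy", "cancer", "healthy"],
})

# Same filters as above: keep WGS cfDNA libraries only.
keep = libraries[(libraries.library_type == "wgs") &
                 (libraries.isolation_type == "cfdna")]
lib_dict = dict(zip(keep["library"], keep["file"]))
# {'lib001': 'test/inputs/a_R1.fastq.gz', 'lib002': 'test/inputs/b_R1.fastq.gz'}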
rule all:
input:
logdir + "/aggregate_output",
cfdna_wgs_frag + "/ratios.tsv",
qcdir + "/cfdna_wgs_read_qc.tsv",
qcdir + "/cfdna_wgs_frag_len.tsv",
onsuccess:
shell("""
bash {cfdna_wgs_scriptdir}/agg_bench.sh {benchdir} {qcdir}/agg_bench.tsv
""")
# For unit testing
indir="test/benchmark"
output="test/analysis/qc/bench_agg.tsv"
if [ -f $output ]; then rm $output; fi
for file in $indir/*
do
base=$(basename $file)
cat $file | awk -v OFS='\t' -v var=$base 'NR>1 {print var,$0}' >> $output
done
sed -i '1i\process\tfloat_sec\trun_time\tmax_rss\tmax_vms\tmax_uss\tmax_pss\tio_in\tio_out\tmean_load\tcpu_time' $output
library(tidyverse)
bmk_in = read_tsv("~/repos/cfdna-wgs/test/analysis/qc/bench_agg.tsv")
bmk =
bmk_in %>%
mutate(process = gsub(".benchmark.txt", "", process)) %>%
mutate(library = process) %>%
mutate(library = ifelse(grepl("lib[0-9]{3}_", process),
sub("^.*lib(\\d{3}).*$", "lib\\1", process), "all_libs")) %>%
mutate(process2 = process) %>%
mutate(process = gsub("^lib..._","", process)) %>%
rename(process_lib = process2)
find_outlier <- function(x) {
return(x > quantile(x, .75) + 1.5*IQR(x))
}
bmk %>% mutate(outlier = ifelse(find_outlier(run_time), process_lib, NA)) %>%
ggplot(.,aes(y=run_time)) +
geom_boxplot() +
geom_text(aes( y = run_time, x = .1,label=outlier), na.rm=TRUE, position = position_jitter())
rule symlink_inputs:
container: default_container,
input:
lambda wildcards: cfdna_wgs_lib_dict[wildcards.library],
output:
read1 = cfdna_wgs_fastqs + "/{library}_raw_R1.fastq.gz",
read2 = cfdna_wgs_fastqs + "/{library}_raw_R2.fastq.gz",
params:
outdir = cfdna_wgs_fastqs,
script = cfdna_wgs_scriptdir + "/symlink.sh",
shell:
"""
{params.script} \
{input} \
{output.read1} \
{output.read2} \
{params.outdir}
"""
#!/usr/bin/env bash
set -o errexit # abort on nonzero exitstatus
set -o nounset # abort on unbound variable
set -o pipefail # don't hide errors within pipes
# Script variables
input_read1="${1}"
output_read1="${2}"
output_read2="${3}"
outdir="${4}"
mkdir -p $outdir
input_read2="$(echo $input_read1 | sed "s/_R1/_R2/g")"
ln -sf --relative ${input_read1} ${output_read1}
ln -sf --relative ${input_read2} ${output_read2}
include: cfdna_wgs_repo + "/workflow/reads.smk"
include: cfdna_wgs_repo + "/workflow/frag.smk"
#########1#########2#########3#########4#########5#########6#########7#########8
# #
# Basic Read and Alignment Processing of #
# Cell-free DNA Whole Genome Sequencing #
# #
#########1#########2#########3#########4#########5#########6#########7#########8
- Snakemake
# Make alignment index
# Note: Upon first run, this rule will touch an empty file with the same path
# as the index prefix. Thereafter, you can avoid repeat indexing when the
# rule "sees" this empty file. For repo integration testing with an
# external reference, indexing can likewise be avoided with this empty
# file at the external index location.

rule cfdna_wgs_index:
    benchmark: benchdir + "/cfdna_wgs_index.benchmark.txt",
    container: cfdna_wgs_container,
    input: genome_fasta,
    log: logdir + "/cfdna_wgs_index.log",
    output:
        done = touch(genome_ref),
    params:
        out_prefix = genome_ref,
        script = cfdna_wgs_scriptdir + "/index.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        bwa index -p {params.out_prefix} {input} &> {log}
        """
- Snakemake
# Adapter-trim and QC reads with fastp

rule cfdna_wgs_fastp:
    benchmark: benchdir + "/{library}_cfdna_wgs_fastp.benchmark.txt",
    container: cfdna_wgs_container,
    input:
        read1 = cfdna_wgs_fastqs + "/{library}_raw_R1.fastq.gz",
        read2 = cfdna_wgs_fastqs + "/{library}_raw_R2.fastq.gz",
    log:
        cmd = logdir + "/{library}_cfdna_wgs_fastp.log",
        html = logdir + "/{library}_cfdna_wgs_fastp.html",
        json = logdir + "/{library}_cfdna_wgs_fastp.json",
    output:
        read1 = cfdna_wgs_fastqs + "/{library}_processed_R1.fastq.gz",
        read2 = cfdna_wgs_fastqs + "/{library}_processed_R2.fastq.gz",
        failed = cfdna_wgs_fastqs + "/{library}_failed_fastp.fastq.gz",
        unpaired1 = cfdna_wgs_fastqs + "/{library}_unpaired_R1.fastq.gz",
        unpaired2 = cfdna_wgs_fastqs + "/{library}_unpaired_R2.fastq.gz",
    params:
        script = cfdna_wgs_scriptdir + "/fastp.sh",
        threads = cfdna_wgs_threads,
    resources:
        mem_mb = 500,
    shell:
        """
        {params.script} \
        {input.read1} \
        {input.read2} \
        {log.html} \
        {log.json} \
        {output.read1} \
        {output.read2} \
        {output.failed} \
        {output.unpaired1} \
        {output.unpaired2} \
        {params.threads} &> {log.cmd}
        """
- Shell script
#!/usr/bin/env bash
set -o errexit   # abort on nonzero exitstatus
set -o nounset   # abort on unbound variable
set -o pipefail  # don't hide errors within pipes

# Script variables
input_read1="${1}"
input_read2="${2}"
log_html="${3}"
log_json="${4}"
output_read1="${5}"
output_read2="${6}"
output_failed="${7}"
output_unpaired1="${8}"
output_unpaired2="${9}"
params_threads="${10}"

# Functions
main(){
    fastp_wrap $output_failed \
               $input_read1 \
               $input_read2 \
               $log_html \
               $log_json \
               $output_read1 \
               $output_read2 \
               $output_unpaired1 \
               $output_unpaired2 \
               $params_threads
}

fastp_wrap(){
    fastp --detect_adapter_for_pe \
          --failed_out $output_failed \
          --in1 $input_read1 \
          --in2 $input_read2 \
          --html $log_html \
          --json $log_json \
          --out1 $output_read1 \
          --out2 $output_read2 \
          --unpaired1 $output_unpaired1 \
          --unpaired2 $output_unpaired2 \
          --thread $params_threads
}

# Run
main "$@"
- Snakemake
# Align reads with BWA

rule cfdna_wgs_align:
    benchmark: benchdir + "/{library}_cfdna_wgs_align.benchmark.txt",
    container: cfdna_wgs_container,
    input:
        ref = genome_ref,
        read1 = cfdna_wgs_fastqs + "/{library}_processed_R1.fastq.gz",
        read2 = cfdna_wgs_fastqs + "/{library}_processed_R2.fastq.gz",
    log: logdir + "/{library}_cfdna_wgs_align.log",
    output:
        sort = cfdna_wgs_bams + "/{library}_raw.bam",
        index = cfdna_wgs_bams + "/{library}_raw.bam.bai",
    params:
        script = cfdna_wgs_scriptdir + "/align.sh",
        threads = 4,
    resources:
        mem_mb = 500,
    shell:
        """
        {params.script} \
        {input.ref} \
        {input.read1} \
        {input.read2} \
        {params.threads} \
        {output.sort} &> {log}
        """
- Shell script
#!/usr/bin/env bash
set -o errexit   # abort on nonzero exitstatus
set -o nounset   # abort on unbound variable
set -o pipefail  # don't hide errors within pipes

input_ref=$1
input_r1=$2
input_r2=$3
threads=$4
output_sort=$5

bwa mem -M -t $threads \
    $input_ref \
    $input_r1 \
    $input_r2 |
    samtools view -@ $threads -b - |
    samtools sort -@ $threads -o $output_sort -

samtools index -@ $threads $output_sort
- Snakemake
# Remove PCR duplicates from aligned reads

rule cfdna_wgs_dedup:
    benchmark: benchdir + "/{library}_cfdna_wgs_dedup.benchmark.txt",
    container: cfdna_wgs_container,
    input: cfdna_wgs_bams + "/{library}_raw.bam",
    log: logdir + "/{library}_cfdna_wgs_dedup.log",
    output: cfdna_wgs_bams + "/{library}_dedup.bam",
    params:
        script = cfdna_wgs_scriptdir + "/dedup.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {input} \
        {output} \
        {params.threads} &> {log}
        """
- Shell script
#!/usr/bin/env bash
set -o errexit   # abort on nonzero exitstatus
set -o nounset   # abort on unbound variable
set -o pipefail  # don't hide errors within pipes

# Script variables
raw_bam="${1}"
dedup_bam="${2}"
threads="${3}"

samtools sort -@ $threads -n -o - $raw_bam |
    samtools fixmate -m - - |
    samtools sort -@ $threads -o - - |
    samtools markdup -@ $threads -r - $dedup_bam

samtools index $dedup_bam
- Snakemake
# Filter de-duplicated alignments.
# Removes unmapped, non-primary, and duplicate reads. Additional location
# filtering can be applied via the config bedfile variable.

checkpoint cfdna_wgs_filter_alignment:
    benchmark: benchdir + "/{library}_cfdna_wgs_filter_alignment.benchmark.txt",
    container: cfdna_wgs_container,
    input: cfdna_wgs_bams + "/{library}_dedup.bam",
    log: logdir + "/{library}_cfdna_wgs_filter_alignment.log",
    output: cfdna_wgs_bams + "/{library}_filt.bam",
    params:
        script = cfdna_wgs_scriptdir + "/filter_alignment.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {input} \
        {params.threads} \
        {output} &> {log}
        """
- Shell script
#!/usr/bin/env bash
input=$1
threads=$2
output=$3

# Filter to reads that are:
# - Not unmapped, not secondary (non-primary), and not duplicate
#   (-F 1284 = 4 + 256 + 1024)
# - MAPQ >= 20 only
# DO NOT restrict to "proper pairs" - this clips long cfDNA fragments!
samtools view -@ $threads -b -F 1284 -h -q 20 -o $output $input
samtools index ${output}
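The -F 1284 value above is just the bitwise union of the three SAM flag bits being excluded; a quick check in Python:

# SAM flag bits excluded by samtools view -F 1284
UNMAPPED  = 0x4    # read unmapped
SECONDARY = 0x100  # not primary alignment
DUPLICATE = 0x400  # PCR or optical duplicate

assert UNMAPPED | SECONDARY | DUPLICATE == 1284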
- Snakemake
# Get read quality by FASTQC

rule cfdna_wgs_fastqc:
    benchmark: benchdir + "/{library}_{processing}_{read}_cfdna_wgs_fastqc.benchmark.txt",
    container: cfdna_wgs_container,
    input: cfdna_wgs_fastqs + "/{library}_{processing}_{read}.fastq.gz",
    log: logdir + "/{library}_{processing}_{read}_cfdna_wgs_fastqc.log",
    output:
        qcdir + "/{library}_{processing}_{read}_fastqc.html",
        qcdir + "/{library}_{processing}_{read}_fastqc.zip",
    params:
        outdir = qcdir,
        script = cfdna_wgs_scriptdir + "/fastqc.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {input} \
        {params.outdir} \
        {params.threads} &> {log}
        """
- Shell script
#!/usr/bin/env bash
set -o errexit   # abort on nonzero exitstatus
set -o nounset   # abort on unbound variable
set -o pipefail  # don't hide errors within pipes

# Script variables
input="${1}"
outdir="${2}"
threads="${3}"

# Functions
fastqc --outdir $outdir \
       --quiet \
       --threads $threads $input
- Snakemake
# Get alignment QC using samtools

rule cfdna_wgs_alignment_qc:
    container: cfdna_wgs_container,
    input: cfdna_wgs_bams + "/{library}_{processing}.bam",
    log:
        flagstat = logdir + "/{library}_{processing}_flagstat_cfdna_wgs_alignment_qc.log",
        samstat = logdir + "/{library}_{processing}_samstats_cfdna_wgs_alignment_qc.log",
    output:
        flagstat = qcdir + "/{library}_{processing}_flagstat.txt",
        samstat = qcdir + "/{library}_{processing}_samstats.txt",
    params:
        script = cfdna_wgs_scriptdir + "/alignment_qc.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {input} \
        {log.flagstat} \
        {log.samstat} \
        {output.flagstat} \
        {output.samstat} \
        {params.threads}
        """
- Shell script
#!/usr/bin/env bash
set -o errexit   # abort on nonzero exitstatus
set -o nounset   # abort on unbound variable
set -o pipefail  # don't hide errors within pipes

# Script variables
input="${1}"
log_flagstat="${2}"
log_samstat="${3}"
output_flagstat="${4}"
output_samstat="${5}"
threads="${6}"

# Functions
main(){
    flagstat $input $output_flagstat $log_flagstat $threads
    samstats $input $output_samstat $log_samstat $threads
}

flagstat(){
    local input="${1}"
    local output="${2}"
    local log="${3}"
    local threads="${4}"
    #
    samtools flagstat -@ $threads $input > $output 2>$log
}

samstats(){
    local input="${1}"
    local output="${2}"
    local log="${3}"
    local threads="${4}"
    #
    samtools stats -@ $threads $input > $output 2>$log
}

# Run
main "$@"
- Snakemake
# Sequencing depth metrics via Picard

rule cfdna_wgs_picard_depth:
    benchmark: benchdir + "/{library}_cfdna_wgs_picard_depth.benchmark.txt",
    container: cfdna_wgs_container,
    input: cfdna_wgs_bams + "/{library}_filt.bam",
    log: logdir + "/{library}_cfdna_wgs_picard_depth.log",
    output: qcdir + "/{library}_picard_depth.txt",
    params:
        script = cfdna_wgs_scriptdir + "/picard_depth.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {input} \
        {config[picard_jar]} \
        {config[genome_fasta]} \
        {output}
        """
- Shell script
#!/usr/bin/env bash
input=$1
picard_jar=$2
genome_fasta=$3
output=$4

java -jar $picard_jar CollectWgsMetrics \
     INPUT=$input \
     OUTPUT=$output \
     READ_LENGTH=150 \
     REFERENCE_SEQUENCE=$genome_fasta
- Snakemake
# Get fragment sizes using deepTools

rule cfdna_wgs_bampefragsize:
    benchmark: benchdir + "/cfdna_wgs_bampefragsize.benchmark.txt",
    container: cfdna_wgs_container,
    input: expand(cfdna_wgs_bams + "/{library}_filt.bam", library = CFDNA_WGS_LIBRARIES),
    log: logdir + "/cfdna_wgs_bampefragsize.log",
    output:
        raw = qcdir + "/deeptools_frag_lengths.txt",
        hist = qcdir + "/deeptools_frag_lengths.png",
    params:
        blacklist = config["blklist"],
        script = cfdna_wgs_scriptdir + "/bampefragsize.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        "{input}" \
        {log} \
        {output.hist} \
        {output.raw} \
        {params.blacklist} \
        {params.threads}
        """
- Shell script
#!/usr/bin/env bash
set -o errexit   # abort on nonzero exitstatus
set -o nounset   # abort on unbound variable
set -o pipefail  # don't hide errors within pipes

# Script variables
input="${1}"
log="${2}"
output_hist="${3}"
output_raw="${4}"
blacklist="${5}"
threads="${6}"

bamPEFragmentSize --bamfiles $input \
                  --numberOfProcessors $threads \
                  --blackListFileName $blacklist \
                  --histogram $output_hist \
                  --maxFragmentLength 1000 \
                  --outRawFragmentLengths $output_raw
- Snakemake
# Make deeptools bamCoverage bedfile

rule cfdna_wgs_bamcoverage:
    benchmark: benchdir + "/{library}_cfdna_wgs_bamcoverage.benchmark.txt",
    container: cfdna_wgs_container,
    input: cfdna_wgs_bams + "/{library}_filt.bam",
    log: logdir + "/{library}_cfdna_wgs_bamcoverage.log",
    output: qcdir + "/{library}_bamcoverage.bg",
    params:
        bin = "10000",
        blacklist = config["blklist"],
        script = cfdna_wgs_scriptdir + "/bamcoverage.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {input} \
        {output} \
        {params.bin} \
        {params.blacklist} \
        {params.threads} &> {log}
        """
- Shell script
#!/usr/bin/env bash
in_bam=$1
out_bg=$2
bin=$3
blacklist=$4
threads=$5

bamCoverage \
    --bam $in_bam \
    --binSize $bin \
    --blackListFileName $blacklist \
    --effectiveGenomeSize 2913022398 \
    --extendReads \
    --ignoreDuplicates \
    --ignoreForNormalization chrX \
    --normalizeUsing RPGC \
    --numberOfProcessors $threads \
    --outFileFormat bedgraph \
    --outFileName $out_bg
- Snakemake
# Make deepTools plotCoverage coverage maps for all filtered bams

rule cfdna_wgs_plotcoverage:
    benchmark: benchdir + "/cfdna_wgs_plotcoverage.benchmark.txt",
    container: cfdna_wgs_container,
    input: expand(cfdna_wgs_bams + "/{library}_filt.bam", library = CFDNA_WGS_LIBRARIES),
    log: logdir + "/cfdna_wgs_plotcoverage.log",
    output:
        raw = qcdir + "/cfdna_wgs_coverage.tsv",
        plot = qcdir + "/cfdna_wgs_coverage.pdf",
    params:
        blacklist = config["blklist"],
        script = cfdna_wgs_scriptdir + "/plotcoverage.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        "{input}" \
        {params.blacklist} \
        {params.threads} \
        {output.raw} \
        {output.plot} &> {log}
        """
- Shell script
#!/usr/bin/env bash
in_bam_string=$1
blacklist=$2
threads=$3
out_raw=$4
out_plot=$5

plotCoverage \
    --bamfiles $in_bam_string \
    --blackListFileName $blacklist \
    --extendReads \
    --numberOfProcessors $threads \
    --outRawCounts $out_raw \
    --plotFile $out_plot \
    --plotFileFormat pdf \
    --skipZeros
- Snakemake
# Aggregate QC files using MultiQC

rule cfdna_wgs_multiqc:
    benchmark: benchdir + "/cfdna_wgs_multiqc.benchmark.txt",
    container: cfdna_wgs_container,
    input:
        expand(logdir + "/{library}_cfdna_wgs_fastp.json", library = CFDNA_WGS_LIBRARIES),
        expand(qcdir + "/{library}_{processing}_{read}_fastqc.zip",
               library = CFDNA_WGS_LIBRARIES,
               processing = ["raw", "processed", "unpaired"],
               read = ["R1", "R2"]),
        expand(qcdir + "/{library}_{processing}_samstats.txt",
               library = CFDNA_WGS_LIBRARIES,
               processing = ["raw", "filt"]),
        expand(qcdir + "/{library}_{processing}_flagstat.txt",
               library = CFDNA_WGS_LIBRARIES,
               processing = ["raw", "filt"]),
        expand(qcdir + "/{library}_picard_depth.txt", library = CFDNA_WGS_LIBRARIES),
        qcdir + "/deeptools_frag_lengths.txt",
        qcdir + "/cfdna_wgs_coverage.tsv",
    log: logdir + "/cfdna_wgs_multiqc.log",
    output:
        qcdir + "/cfdna_wgs_multiqc.html",
        qcdir + "/cfdna_wgs_multiqc_data/multiqc_fastqc.txt",
        qcdir + "/cfdna_wgs_multiqc_data/multiqc_samtools_stats.txt",
        qcdir + "/cfdna_wgs_multiqc_data/multiqc_picard_wgsmetrics.txt",
        qcdir + "/cfdna_wgs_multiqc_data/multiqc_samtools_flagstat.txt",
    params:
        out_dir = qcdir,
        out_name = "cfdna_wgs_multiqc",
        script = cfdna_wgs_scriptdir + "/multiqc.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        "{input}" \
        {params.out_name} \
        {params.out_dir} &> {log}
        """
- Shell script
#!/usr/bin/env bash
set -o errexit   # abort on nonzero exitstatus
set -o nounset   # abort on unbound variable
set -o pipefail  # don't hide errors within pipes

# Script variables
input="${1}"
out_name="${2}"
out_dir="${3}"

# Functions
multiqc $input \
        --force \
        --outdir $out_dir \
        --filename $out_name
- Snakemake
# Make a tab-separated aggregate QC table

checkpoint cfdna_wgs_make_qc_tsv:
    benchmark: benchdir + "/cfdna_wgs_make_qc_tsv.benchmark.txt",
    container: cfdna_wgs_container,
    input:
        fq = qcdir + "/cfdna_wgs_multiqc_data/multiqc_fastqc.txt",
        mqsam = qcdir + "/cfdna_wgs_multiqc_data/multiqc_samtools_stats.txt",
        mqflag = qcdir + "/cfdna_wgs_multiqc_data/multiqc_samtools_flagstat.txt",
        picard = qcdir + "/cfdna_wgs_multiqc_data/multiqc_picard_wgsmetrics.txt",
        deeptools_frag = qcdir + "/deeptools_frag_lengths.txt",
        deeptools_cov = qcdir + "/cfdna_wgs_coverage.tsv",
    log: logdir + "/cfdna_wgs_make_qc_tsv.log",
    output:
        readqc = qcdir + "/cfdna_wgs_read_qc.tsv",
        fraglen = qcdir + "/cfdna_wgs_frag_len.tsv",
    params:
        script = cfdna_wgs_scriptdir + "/make_qc_tsv.R",
    shell:
        """
        Rscript {params.script} \
        {input.fq} \
        {input.mqsam} \
        {input.mqflag} \
        {input.picard} \
        {input.deeptools_frag} \
        {input.deeptools_cov} \
        {output.readqc} \
        {output.fraglen} >& {log}
        """
- Rscript
#!/usr/bin/env Rscript
#
# Unit test variables
## mqc_dir="test/analysis/qc/cfdna_wgs_multiqc_data"
## fastqc_input = paste0(mqc_dir,"/multiqc_fastqc.txt")
## samstats_input = paste0(mqc_dir, "/multiqc_samtools_stats.txt")
## flagstats_input = paste0(mqc_dir, "/multiqc_samtools_flagstat.txt")
## picard_input = paste0(mqc_dir, "/multiqc_picard_wgsmetrics.txt")
## deeptools_frag_input = "test/analysis/qc/deeptools_frag_lengths.txt"
## deeptools_cov_input = "test/analysis/qc/cfdna_wgs_coverage.tsv"

args = commandArgs(trailingOnly = TRUE)
fastqc_input = args[1]
samstats_input = args[2]
flagstats_input = args[3]
picard_input = args[4]
deeptools_frag_input = args[5]
deeptools_cov_input = args[6]
readqc_out_tbl = args[7]
frag_len_out_tbl = args[8]

library(tidyverse)

process_multiqc_fastqc = function(multiqc_fastqc_input){
    as_tibble(read.table(multiqc_fastqc_input, header = TRUE, sep = '\t', stringsAsFactors = FALSE)) %>%
        mutate(library = substr(Filename, 1, 6)) %>%
        mutate(read = ifelse(grepl("R1", Filename), "read1", "read2")) %>%
        mutate(fastq_processing = gsub("_.*$", "", substr(Sample, 8, nchar(Sample)))) %>%
        select(!c(Sample, File.type, Encoding)) %>%
        pivot_wider(
            names_from = c(read, fastq_processing),
            values_from = !c(library, read, fastq_processing))
}

fastqc = process_multiqc_fastqc(fastqc_input)

# (Exploratory block; result is not assigned)
as_tibble(read.table(fastqc_input, header = TRUE, sep = '\t', stringsAsFactors = FALSE)) %>%
    mutate(library = substr(Sample, 1, 6)) %>%
    mutate(bam_processing = gsub("_.*$", "", substr(Sample, 8, nchar(Sample)))) %>%
    select(!c(Sample)) %>%
    pivot_wider(
        names_from = c(bam_processing),
        values_from = !c(library, bam_processing))

process_multiqc_samfile = function(multiqc_samfile){
    read_tsv(multiqc_samfile) %>%
        mutate(library = substr(Sample, 1, 6)) %>%
        mutate(bam_processing = gsub("_.*$", "", gsub("lib..._", "", Sample))) %>%
        select(!c(Sample)) %>%
        pivot_wider(
            names_from = c(bam_processing),
            values_from = !c(library, bam_processing))
}

samstats = process_multiqc_samfile(samstats_input)
flagstats = process_multiqc_samfile(flagstats_input)

deeptools_frag = read_tsv(deeptools_frag_input,
                          col_names = c("frag_len", "frag_count", "file"),
                          skip = 1) %>%
    filter(frag_len < 500) %>%
    mutate(library = substr(gsub("^.*lib", "lib", file), 1, 6)) %>%
    mutate(frag_len = sub("^", "frag_len", frag_len)) %>%
    select(library, frag_len, frag_count) %>%
    pivot_wider(names_from = frag_len, values_from = frag_count)

picard = as_tibble(read.table(picard_input, header = TRUE, sep = '\t', stringsAsFactors = FALSE)) %>%
    mutate(library = Sample)

deeptools_cov = read_tsv(deeptools_cov_input, skip = 1) %>%
    pivot_longer(!c(`#'chr'`, `'start'`, `'end'`), names_to = "file", values_to = "cnt") %>%
    rename(chr = `#'chr'`, start = `'start'`, end = `'end'`) %>%
    mutate(library = substr(file, 2, 7)) %>%
    group_by(library) %>%
    summarise(
        mean_cov = mean(cnt),
        median_cov = median(cnt),
    )

readqc = fastqc %>%
    left_join(samstats, by = "library") %>%
    left_join(flagstats, by = "library") %>%
    left_join(deeptools_frag, by = "library") %>%
    left_join(picard, by = "library") %>%
    left_join(deeptools_cov, by = "library")

write.table(readqc, file = readqc_out_tbl, row.names = F, sep = '\t', quote = F)

all_frag_len = data.frame(frag_len = 1:500)

frag_len = readqc %>%
    select(starts_with("frag_len") | matches("library")) %>%
    pivot_longer(!library, names_to = "frag_len", values_to = "count") %>%
    mutate(frag_len = as.numeric(gsub("frag_len", "", frag_len))) %>%
    mutate(count = as.numeric(count)) %>%
    pivot_wider(names_from = library, values_from = count) %>%
    right_join(all_frag_len) %>%
    arrange(frag_len) %>%
    replace(is.na(.), 0)

write_tsv(frag_len, file = frag_len_out_tbl)
- Snakemake
rule downsample_bams:
    container: cfdna_wgs_container,
    input: cfdna_wgs_bams + "/{library}_filt.bam",
    output: touch(logdir + "/{library}_{downsample}_downsample.done"),
    params:
        out_dir = cfdna_wgs_bams,
        script = cfdna_wgs_scriptdir + "/downsample_bams.sh",
        suffix = "_filt.bam",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {input} \
        {wildcards.downsample} \
        {params.out_dir} \
        {params.suffix} \
        {params.threads}
        """
- Shell script
#!/usr/bin/env bash
# For unit testing
# in_bam=test/analysis/cfdna_wgs/bams/lib001_filt.bam
# milreads=0.001
# checker=test/tmp/lib001_ds0.001.txt
# outdir=test/analysis/cfdna_wgs/bams
# suffix=_filt.bam
# threads=4

in_bam="${1}"
milreads="${2}"
outdir="${3}"
suffix="${4}"
threads="${5}"

downsample(){
    # Derived variables
    milreads_full=$(awk -v milreads="${milreads}" 'BEGIN{milreads_full=(1000000*milreads); print milreads_full}')
    factor=$(samtools idxstats $in_bam | cut -f3 | awk -v count=$milreads_full 'BEGIN {total=0} {total += $1} END {print count/total}')
    base=$(basename -s $suffix $in_bam)
    out_bam=${outdir}/${base}_ds${milreads}.bam
    #
    # Downsample only when the target is smaller than the bam
    # (numeric comparison via awk; [[ $factor < 1 ]] would compare strings)
    if awk -v f="$factor" 'BEGIN{exit !(f < 1)}'; then
        samtools view -s $factor -b -@ $threads $in_bam > $out_bam
    fi
}

downsample $in_bam $milreads $suffix
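The factor arithmetic in the script, restated as a small Python sketch with hypothetical read counts (milreads is millions of reads, so 0.001 means 1,000 reads):

milreads = 0.001
target_reads = int(1_000_000 * milreads)   # 1,000
total_reads = 250_000                      # hypothetical idxstats mapped-read total
factor = target_reads / total_reads        # 0.004

# Subsample only when the bam holds more reads than the target.
if factor < 1:
    print(f"samtools view -s {factor} -b in.bam > out.bam")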
# If a downsample occurred, write the filename into this per-library log; else leave the log file blank
rule log_downsample:
input: logdir + "/{library}_{downsample}_downsample.done",
output: logdir + "/{library}_{downsample}_made",
params:
bamdir = cfdna_wgs_bams,
shell:
"""
dspath={params.bamdir}/{wildcards.library}_ds{wildcards.downsample}.bam
if [ -f $dspath ]; then echo "$dspath" > {output}; else touch {output}; fi
"""
# Use the downsampled bam logs to make a single text file of conditionally executed final targets.
# Specifically in this example, log text lines take the form
# cfdna_wgs_bams + "/{library}_ds{downsample}_frag90_150.bam" to set up conditional execution of fragment filtering ONLY on downsampled bams.
# Note: the alternative sed delimiter "~" allows a path value such as cfdna_wgs_bams as the param.
checkpoint ds_cond_target_list:
input: expand(logdir + "/{library}_{downsample}_made", library = CFDNA_WGS_LIBRARIES, downsample = DOWNSAMPLE),
output: logdir + "/ds_final_targets",
params:
outdir = cfdna_wgs_bams,
frag_distro=config["frag_distro"]
shell:
"""
if [ -f {output} ]; then rm {output}; fi
cat {input} > {output}
sed -i 's~^.*lib~{params.outdir}/lib~g' {output}
sed -i 's/.bam$/_frag{params.frag_distro}.bam/g' {output}
"""
# Function just pulls the final target names out of ds_final_targets
def get_ds_targets(wildcards):
with open(checkpoints.ds_cond_target_list.get(**wildcards).output[0], "r") as f:
non_empty_files = [l.strip() for l in f.readlines()]
return non_empty_files
# This rule allows execution of rules which will generate the conditional targets in ds_cond_target_list
rule make_ds_targets:
input:
get_ds_targets
output: logdir + "/aggregate_output"
run:
with open(output[0], "w") as f:
f.write("\n".join(input))
rule frag_filt:
container: cfdna_wgs_container,
input:
main = cfdna_wgs_bams + "/{library}_ds{downsample}.bam",
check = logdir + "/{library}_{downsample}_made",
output:
nohead = temp(cfdna_wgs_bams + "/{library}_ds{downsample}_frag{frag_distro}.nohead"),
onlyhead = temp(cfdna_wgs_bams + "/{library}_ds{downsample}_frag{frag_distro}.only"),
final = cfdna_wgs_bams + "/{library}_ds{downsample}_frag{frag_distro}.bam",
params:
script = cfdna_wgs_scriptdir + "/frag_filt.sh",
threads = cfdna_wgs_threads,
shell:
"""
frag_min=$(echo {wildcards.frag_distro} | sed -e "s/_.*$//g")
frag_max=$(echo {wildcards.frag_distro} | sed -e "s/^.*_//g")
{params.script} \
{input.main} \
{output.nohead} \
$frag_min \
$frag_max \
{config[threads]} \
{output.onlyhead} \
{output.final}
"""
- Shell script
#!/usr/bin/env bash
# Positional arguments:
#  $1 = input bam, $2 = headerless filtered alignments, $3 = min fragment length,
#  $4 = max fragment length, $5 = threads, $6 = header file, $7 = final bam
#
# Steps
## Filter by absolute value of TLEN for each read
sambamba view -t $5 $1 |
    awk -F'\t' -v upper="$4" 'sqrt($9*$9) < upper {print $0}' |
    awk -F'\t' -v lower="$3" 'sqrt($9*$9) > lower {print $0}' > $2
## Restore header
sambamba view -H $1 > $6
cat $6 $2 |
    sambamba view -t 4 -S -f bam /dev/stdin |
    sambamba sort -t 4 -o $7 /dev/stdin
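The wildcard parsing and TLEN filter above, sketched in Python (the 90_150 value is one frag_distro from the config):

frag_distro = "90_150"
frag_min, frag_max = (int(x) for x in frag_distro.split("_"))

def keep(tlen):
    # awk's sqrt($9*$9) is simply abs(TLEN)
    return frag_min < abs(tlen) < frag_max

assert keep(-120) and not keep(90) and not keep(200)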
#########1#########2#########3#########4#########5#########6#########7#########8
# #
# Fragmentomic Analysis of Cell-free DNA Whole Genome Sequencing #
# #
#########1#########2#########3#########4#########5#########6#########7#########8
- Snakemake
rule make_gc_map_bind:
    container: cfdna_wgs_container,
    input:
        gc5mb = config["gc5mb"],
        blklist = config["blklist"],
    log: logdir + "/make_gc_map_bind.log",
    output: refdir + "/keep_5mb.bed",
    params:
        script = cfdna_wgs_scriptdir + "/make_gc_map_bind.sh",
    shell:
        """
        {params.script} \
        {input.gc5mb} \
        {input.blklist} \
        {output} &> {log}
        """
- Shell script
#!/usr/bin/env bash
gc5mb="${1}"
blklist="${2}"
keep="${3}"

# Keep 5 Mb windows that do not intersect the blacklist, sit on primary
# contigs (no "_" in the name), and have a column-4 (GC) value >= 0.3
bedtools intersect -a $gc5mb -b $blklist -v -wa |
    grep -v _ |
    awk '{ if ($4 >= 0.3) print $0 }' > $keep
- An error here may be due to multimappers; see https://www.biostars.org/p/55149/
- Snakemake
# Make a bed file from filtered bam

rule filt_bam_to_frag_bed:
    benchmark: benchdir + "/{library}_filt_bam_to_frag_bed.benchmark.txt",
    container: cfdna_wgs_container,
    input: cfdna_wgs_bams + "/{library}_filt.bam",
    log: logdir + "/{library}_filt_bam_to_frag_bed.log",
    output: cfdna_wgs_frag_beds + "/{library}_filt.bed",
    params:
        fasta = genome_fasta,
        script = cfdna_wgs_scriptdir + "/filt_bam_to_frag_bed.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {input} \
        {params.fasta} \
        {params.threads} \
        {output}
        """
- Shell script
#!/usr/bin/env bash
# Snakemake variables
input_bam="$1"
params_fasta="$2"
threads="${3}"
output_frag_bed="$4"

# Function
bam_to_frag(){
    # Ensure name-sorted bam file
    samtools sort -@ $threads -n -o - $1 |
        samtools fixmate -@ $threads -m -r - - |
        # Make bedpe
        bedtools bamtobed -bedpe -i - |
        # Filter any potential non-standard alignments
        awk '$1==$4 {print $0}' |
        awk '$2 < $6 {print $0}' |
        # Create full-fragment bed file
        awk -v OFS='\t' '{print $1,$2,$6}' |
        # Annotate with GC content and fragment length
        bedtools nuc -fi $2 -bed stdin |
        # Convert back to standard bed with additional columns
        awk -v OFS='\t' '{print $1,$2,$3,$5,$12}' |
        sed '1d' > $3
}

# Run command
bam_to_frag $input_bam \
            $params_fasta \
            $output_frag_bed
- Snakemake
# Make GC distributions

rule gc_distro:
    benchmark: benchdir + "/{library}_cfdna_wgs_gc_distro.benchmark.txt",
    container: cfdna_wgs_container,
    input: cfdna_wgs_frag_beds + "/{library}_filt.bed",
    log: logdir + "/{library}_cfdna_wgs_gc_distro.log",
    output: cfdna_wgs_frag_gc_distros + "/{library}_gc_distro.csv",
    params:
        script = cfdna_wgs_scriptdir + "/gc_distro.R",
    shell:
        """
        Rscript {params.script} \
        {input} \
        {output} \
        > {log} 2>&1
        """
- Rscript
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly = TRUE)
bed_file = args[1]
distro_file = args[2]

library(tidyverse)

# Read in modified bed
bed = read.table(bed_file, sep = '\t')
names(bed) = c("chr", "start", "end", "gc_raw", "len")

# Generate distribution csv
distro = bed %>%
    # Round GC
    mutate(gc_strata = round(gc_raw, 2)) %>%
    # Count frags per strata
    count(gc_strata) %>%
    # Get fraction frags
    mutate(fract_frags = n/sum(n)) %>%
    mutate(library_id = gsub("_frag.bed", "", gsub("^.*lib", "lib", bed_file))) %>%
    select(library_id, gc_strata, fract_frags) %>%
    write.csv(file = distro_file, row.names = F)
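The strata computation in the Rscript above, restated with pandas on hypothetical GC fractions:

import pandas as pd

bed = pd.DataFrame({"gc_raw": [0.412, 0.408, 0.553, 0.548, 0.410]})

distro = (
    bed.assign(gc_strata=bed.gc_raw.round(2))  # round GC to 0.01 strata
       .groupby("gc_strata").size()            # count frags per stratum
       .pipe(lambda s: s / s.sum())            # fraction of fragments
)
# gc_strata 0.41 -> 0.6, 0.55 -> 0.4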
- Snakemake
# Make healthy GC distributions summary file

rule healthy_gc:
    benchmark: benchdir + "/cfdna_wgs_healthy_gc.benchmark.txt",
    container: cfdna_wgs_container,
    input: expand(cfdna_wgs_frag_gc_distros + "/{library}_gc_distro.csv", library = CFDNA_WGS_HEALTHY_LIBRARIES),
    log: logdir + "/cfdna_wgs_healthy_gc.log",
    output: cfdna_wgs_frag_gc_distros + "/healthy_med.rds",
    params:
        distro_dir = cfdna_wgs_frag_gc_distros,
        script = cfdna_wgs_scriptdir + "/healthy_gc.R",
    shell:
        """
        Rscript {params.script} \
        {params.distro_dir} \
        "{input}" \
        {output} > {log} 2>&1
        """
- Rscript
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly = TRUE)
distro_dir = args[1]
healthy_libs_str = args[2]
healthy_med_file = args[3]

library(tidyverse)

healthy_libs_distros = unlist(strsplit(healthy_libs_str, " "))

read_in_gc = function(gc_csv){
    read.csv(gc_csv, header = T)
}

healthy_list = lapply(healthy_libs_distros, read_in_gc)

# Bind
healthy_all = do.call(rbind, healthy_list)

# Summarize
healthy_med = healthy_all %>%
    group_by(gc_strata) %>%
    summarise(med_frag_fract = median(fract_frags))

# Save
saveRDS(healthy_med, file = healthy_med_file)
- Snakemake
# Sample fragments by healthy GC proportions

rule cfdna_wgs_gc_sample:
    benchmark: benchdir + "/{library}_cfdna_wgs_gc_sample.benchmark.txt",
    container: cfdna_wgs_container,
    input:
        frag_bed = cfdna_wgs_frag_beds + "/{library}_filt.bed",
        healthy_med = cfdna_wgs_frag_gc_distros + "/healthy_med.rds",
    log: logdir + "/{library}_cfdna_wgs_gc_sample.log",
    output: cfdna_wgs_frag_beds + "/{library}_sampled_frag.bed",
    params:
        script = cfdna_wgs_scriptdir + "/gc_sample.R",
    shell:
        """
        Rscript {params.script} \
        {input.healthy_med} \
        {input.frag_bed} \
        {output} > {log} 2>&1
        """
- Rscript
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly = TRUE)
healthy_med = args[1]
frag_file = args[2]
sampled_file = args[3]

library(tidyverse)

healthy_fract = readRDS(healthy_med)
frag_file = read.table(frag_file, sep = '\t', header = F)
frag_bed = frag_file
names(frag_bed) = c("chr", "start", "end", "gc_raw", "len")

frag = frag_bed %>%
    # Round off the GC strata
    mutate(gc_strata = round(gc_raw, 2)) %>%
    # Join the median count of fragments per strata in healthies
    # Use this later as sampling weight
    left_join(healthy_fract, by = "gc_strata")

# Determine frags to sample by counts in strata for which healthies had the
# highest count (currently unused below)
stratatotake = frag$gc_strata[which.max(frag$med_frag_fract)]
fragsinmaxstrata = length(which(frag$gc_strata == stratatotake))
fragstotake = round(fragsinmaxstrata/stratatotake)

sampled = frag %>%
    filter(!is.na(med_frag_fract)) %>%
    slice_sample(., n = nrow(.), weight_by = med_frag_fract, replace = T) %>%
    select(chr, start, end, len, gc_strata)

write.table(sampled, sep = "\t", col.names = F, row.names = F, quote = F, file = sampled_file)
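The core of the sampling step is a weighted resample in which each fragment's weight is the healthy median fraction for its GC stratum; a pandas sketch with made-up values:

import pandas as pd

frag = pd.DataFrame({
    "len":            [110, 160, 210, 95],
    "gc_strata":      [0.41, 0.55, 0.41, 0.60],
    "med_frag_fract": [0.6, 0.4, 0.6, None],  # None: stratum absent in healthies
})

sampled = (
    frag.dropna(subset=["med_frag_fract"])    # like filter(!is.na(med_frag_fract))
        .sample(n=3, weights="med_frag_fract", replace=True, random_state=0)
)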
- Snakemake
# Sum fragments in short and long length groups

rule frag_sum:
    benchmark: benchdir + "/{library}_frag_sum.benchmark.txt",
    container: cfdna_wgs_container,
    input: cfdna_wgs_frag_beds + "/{library}_sampled_frag.bed",
    log: logdir + "/{library}_cfdna_wgs_frag_window_sum.log",
    output:
        short = cfdna_wgs_frag_beds + "/{library}_norm_short.bed",
        long = cfdna_wgs_frag_beds + "/{library}_norm_long.bed",
    params:
        script = cfdna_wgs_scriptdir + "/frag_window_sum.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {input} \
        {output.short} {output.long} &> {log}
        """
- Shell script
#!/usr/bin/env bash
input_frag="$1"
output_short="$2"
output_long="$3"

# Functions
# Column 4 of the sampled fragment bed is fragment length
make_short(){
    cat $1 | awk '{if ($4 >= 100 && $4 <= 150) print $0}' > $2
}
make_long(){
    cat $1 | awk '{if ($4 >= 151 && $4 <= 220) print $0}' > $2
}

# Run command
make_short $input_frag $output_short
make_long $input_frag $output_long
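The two awk filters define DELFI-style length classes; the same classification in Python:

def len_class(frag_len):
    if 100 <= frag_len <= 150:
        return "short"
    if 151 <= frag_len <= 220:
        return "long"
    return None  # fragment not counted in either class

assert len_class(120) == "short"
assert len_class(180) == "long"
assert len_class(60) is None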
- Snakemake
# Count short and long fragments intersecting kept genomic windows

rule frag_window_count:
    benchmark: benchdir + "/{library}_cfdna_wgs_frag_window_int.benchmark.txt",
    container: cfdna_wgs_container,
    input:
        short = cfdna_wgs_frag_beds + "/{library}_norm_short.bed",
        long = cfdna_wgs_frag_beds + "/{library}_norm_long.bed",
        matbed = refdir + "/keep_5mb.bed",
    log: logdir + "/{library}_cfdna_wgs_frag_window_int.log",
    output:
        short = cfdna_wgs_frag_counts + "/{library}_cnt_short.tmp",
        long = cfdna_wgs_frag_counts + "/{library}_cnt_long.tmp",
    params:
        script = cfdna_wgs_scriptdir + "/frag_window_int.sh",
        threads = threads,
    shell:
        """
        {params.script} \
        {input.short} \
        {input.matbed} \
        {output.short}
        {params.script} \
        {input.long} \
        {input.matbed} \
        {output.long}
        """
- Shell script
#!/usr/bin/env bash
input=$1
keep_bed=$2
output=$3

bedtools intersect -c \
    -a $keep_bed \
    -b $input > $output
- Snakemake
# Merge short and long fragment counts by genomic position for all libraries

rule cfdna_wgs_count_merge:
    benchmark: benchdir + "/cfdna_wgs_count_merge.benchmark.txt",
    container: cfdna_wgs_container,
    input: expand(cfdna_wgs_frag_counts + "/{library}_cnt_{length}.tmp", library = CFDNA_WGS_LIBRARIES, length = ["short", "long"]),
    log: logdir + "/cfdna_wgs_count_merge.log",
    output: cfdna_wgs_frag + "/frag_counts.tsv",
    params:
        counts_dir = cfdna_wgs_frag + "/counts",
        script = cfdna_wgs_scriptdir + "/count_merge.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {params.counts_dir} \
        {output} &> {log}
        """
- Shell script
#!/usr/bin/env bash
# For unit testing
#counts_dir="/home/jeszyman/mpnst/analysis/cfdna-wgs/frag/counts"
#out_tsv="/home/jeszyman/mpnst/analysis/cfdna-wgs/frag/frag_counts.tsv"

# Define variables
counts_dir="${1}"
out_tsv="${2}"

# Remove the existing aggregate file if present
if [ -f $out_tsv ]; then rm $out_tsv; fi
#touch $out_tsv

# Make aggregate file
for file in ${counts_dir}/*; do
    # Add file name to each line
    awk '{print FILENAME (NF?"\t":"") $0}' $file |
        # Modify file name to library id
        sed 's/^.*lib/lib/g' |
        sed 's/_.*_/\t/g' |
        # Cleanup "tmp"
        sed 's/.tmp//g' |
        # Send to output
        sed 's/\.bed//g' >> $out_tsv
done

# Add a tab-separated header
sed -i '1 i\library\tlen_class\tchr\tstart\tend\tgc\tcount' $out_tsv
#!/usr/bin/env bash
output=$1
declare -a array2=$2

if [ -f $output ]; then \rm $output; fi

for file in ${array2[@]}; do
    awk '{print FILENAME (NF?"\t":"") $0}' $file |
        sed 's/^.*lib/lib/g' |
        sed 's/_.*_/\t/g' |
        # Cleanup "tmp"
        sed 's/.tmp//g' |
        sed 's/\.bed//g' >> $output
done

# Add a tab-separated header
sed -i '1 i\library\tlen_class\tchr\tstart\tend\tcount' $output
- Snakemake
rule unit_cent_sd:
    benchmark: benchdir + "/unit_cent_sd.benchmark.txt",
    container: cfdna_wgs_container,
    input: cfdna_wgs_frag + "/frag_counts.tsv",
    log: logdir + "/unit_cent_sd.log",
    output: cfdna_wgs_frag + "/ratios.tsv",
    params:
        script = cfdna_wgs_scriptdir + "/make_ratios.R",
    shell:
        """
        Rscript {params.script} \
        {input} {output} > {log} 2>&1
        """
- Rscript
#!/usr/bin/env Rscript
# For unit testing
## frags_tsv = "test/analysis/cfdna_wgs/frag/frag_counts.tsv"
## ratios_tsv = "/home/jeszyman/mpnst/analysis/cfdna-wgs/frag/ratios.tsv"

args = commandArgs(trailingOnly = TRUE)
frags_tsv = args[1]
ratios_tsv = args[2]

# Load necessary packages
library(tidyverse)

# Load aggregate frag tsv
frags = read_tsv(frags_tsv)

# From per-position, per-library short and long fragment counts, make a zero-centered fragment ratio
# See https://github.com/cancer-genomics/reproduce_lucas_wflow/blob/master/analysis/fig2a.Rmd
ratios = frags %>%
    mutate_at(vars(start, end, count), as.numeric) %>%
    # Put lib-bin short and long values on the same row in order to make per-row ratios
    pivot_wider(names_from = len_class, values_from = count,
                values_fn = function(x) mean(x)) %>%
    mutate(fract = short/long) %>%
    select(library, chr, start, end, fract) %>%
    # Zero-center by library
    group_by(library) %>%
    mutate(ratio.centered = scale(fract, scale = F)[,1])

write_tsv(ratios, file = ratios_tsv)
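The short/long ratio and zero-centering, restated with pandas on hypothetical per-window counts for one library:

import pandas as pd

counts = pd.DataFrame({
    "window": ["w1", "w2", "w3"],
    "short":  [120, 80, 100],
    "long":   [100, 100, 100],
})

counts["fract"] = counts["short"] / counts["long"]
# Equivalent of R's scale(fract, scale = FALSE): subtract the library mean.
counts["ratio.centered"] = counts["fract"] - counts["fract"].mean()
# [0.2, -0.2, 0.0]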
- Based on cfDNA fragmentomics cite:mathios2021
#########1#########2#########3#########4#########5#########6#########7#########8
# #
# Integration Testing Snakefile for Analysis of Cell-free DNA #
# Whole Genome Sequencing Copy Number Alteration and Fragmentomics #
# #
#########1#########2#########3#########4#########5#########6#########7#########8
# Load necessary packages for snakemake run
import os
import pandas as pd
import re
import numpy as np
# Variable naming
benchdir = config["benchdir"]
cfdna_wgs_repo = config["cfdna_wgs_repo"]
cfdna_wgs_scriptdir = config["cfdna_wgs_scriptdir"]
logdir = config["logdir"]
threads = config["threads"]
# Suggested directory structure:
analysis = config["datadir"] + "/analysis"
cfdna_wgs = config["datadir"] + "/analysis/cfdna_wgs"
cfdna_wgs_cna = config["datadir"] + "/analysis/cfdna_wgs/cna"
cfdna_wgs_frag = config["datadir"] + "/analysis/cfdna_wgs/frag"
# Terminal variable paths:
# (These variables are used directly in the cna snakefile)
cfdna_wgs_cna_in_bams = cfdna_wgs_cna + "/input_bams"
cfdna_wgs_cna_frag_bams = cfdna_wgs_cna + "/frag_bams"
cfdna_wgs_cna_wigs = cfdna_wgs_cna + "/wigs"
cfdna_wgs_cna_ichor_nopon = cfdna_wgs_cna + "/ichor_nopon"
cfdna_wgs_frag_input_bams = cfdna_wgs_cna + "/input_bams"
cfdna_wgs_frag_beds = cfdna_wgs_frag + "/beds"
cfdna_wgs_frag_counts = cfdna_wgs_frag + "/counts"
refdir = config["datadir"] + "/ref"
# Additional variable names used directly in the cna snakefile:
chrom_sizes = config["chrom_sizes"]
genome_fasta = "/mnt/ris/aadel/Active/mpnst/inputs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna"
#TMP_FRAG_LIBS = ["lib001_filt","lib002_filt"]
#chrs = "chr8"
chrs = "chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chrX,chrY,chrM"
keep_bed = refdir + "/hg38_keep.bed"
blklist = config["blklist"]
genome_ref = config["genome_ref"]
FRAG_DISTROS = config["frag_distro"]
cfdna_wgs_threads = config["threads"]
cfdna_wgs_scriptdir = config["cfdna_wgs_scriptdir"]
cfdna_wgs_container = config["cfdna_wgs_container"]
default_container = config["default_container"]
autosome_bed = refdir + "/hg38_autosomes.bed"
cfdna_wgs_fastqs = cfdna_wgs + "/fastqs"
cfdna_wgs_bams = cfdna_wgs + "/bams"
qc = config["datadir"] + "/qc"
# cfdna_wgs_container = config["cfdna_wgs_container"]
# cfdna_wgs_cna_bam_inputs = config["dir"]["data"] + "/bam/filt"
# cfdna_wgs_cna_bam_fragfilt = config["dir"]["data"] + "/bam/frag"
# wig = config["dir"]["data"] + "/wig"
# ichor = config["dir"]["data"] + "/ichor"
# cfdna_wgs_logs = config["dir"]["data"] + "logs/cfdna_wgs"
# ichor_nopon = config["dir"]["data"] + "/ichor_nopon"
libraries = pd.read_table(config["datadir"] + "/inputs/libraries.tsv")
readable = []
for x in libraries.file:
readable.append(os.access(x, os.R_OK))
libraries['readable']=readable
cfdna_libraries = libraries
cfdna_libraries = cfdna_libraries[cfdna_libraries.library_type == "wgs"]
cfdna_libraries = cfdna_libraries[cfdna_libraries.isolation_type == "cfdna"]
cfdna_libraries = cfdna_libraries[cfdna_libraries.readable == True]
library_indict = cfdna_libraries["library"].tolist()
file_indict = cfdna_libraries["file"].tolist()
lib_dict = dict(zip(library_indict, file_indict))
CFDNA_WGS_LIBRARIES = list(lib_dict.keys())
cna_libraries = pd.read_table(config["datadir"] + "/inputs/cna_libraries.tsv")
readable = []
for x in cna_libraries.bam_file:
readable.append(os.access(x, os.R_OK))
cna_libraries['readable']=readable
cna_libraries = cna_libraries[cna_libraries.readable == True]
library_indict = cna_libraries["library"].tolist()
file_indict = cna_libraries["bam_file"].tolist()
lib_dict = dict(zip(library_indict, file_indict))
CNA_WGS_LIBRARIES = list(lib_dict.keys())
rule all:
input:
# # From this snakefile:
# # cfdna_wgs_symlink:
# expand(cfdna_wgs_cna_in_bams +
# "/{library}.bam",
# library = lib_dict.keys()),
# # From cna.smk
# # cna_frag_filt:
# expand(cfdna_wgs_cna_frag_bams +
# "/{library}_frag{frag_distro}.bam",
# library = CNA_WGS_LIBRARIES,
# frag_distro = FRAG_DISTROS),
# # bam_to_wig:
# expand(cfdna_wgs_cna_wigs +
# "/{library}_frag{frag_distro}.wig",
# library = CNA_WGS_LIBRARIES,
# frag_distro = FRAG_DISTROS),
# # ichor_nopon:
# expand(cfdna_wgs_cna_ichor_nopon +
# "/{library}_frag{frag_distro}.cna.seg",
# library = CNA_WGS_LIBRARIES,
# frag_distro = FRAG_DISTROS),
# From frag.smk
# make_gc_map_bind:
refdir + "/keep_5mb.bed",
# filt_bam_to_frag_bed:
expand(cfdna_wgs_frag_beds +
"/{library}_filt.bed",
library = CNA_WGS_LIBRARIES),
# # gc_distro:
# expand(cfdna_wgs_frag_gc_distros +
# "/{library}_gc_distro.csv",
# library = CNA_WGS_LIBRARIES),
# # healthy_gc:
# cfdna_wgs_frag_gc_distros + "/healthy_med.rds",
# #
# expand(cfdna_wgs_frag_beds +
# "/{library}_sampled_frag.bed",
# library = CNA_WGS_LIBRARIES),
# expand(cfdna_wgs_frag_beds) /
# "{library}_norm_{length}.bed",
# library = CNA_WGS_LIBRARIES,
# length = ["short", "long"]),
expand(cfdna_wgs_frag_counts +
"/{library}_cnt_{length}.tmp",
library = CNA_WGS_LIBRARIES,
length = ["short", "long"]),
cfdna_wgs_frag + "/frag_counts.tsv",
#
# unit_cent_sd:
cfdna_wgs_frag + "/ratios.tsv",
- Snakemake
# Symlink input bams

rule cfdna_wgs_symlink:
    container: cfdna_wgs_container,
    input: lambda wildcards: lib_dict[wildcards.library],
    output: cfdna_wgs_cna_in_bams + "/{library}.bam",
    shell:
        """
        ln --force --relative --symbolic {input} {output}
        """
include: cfdna_wgs_repo + "/workflow/reads.smk"
include: cfdna_wgs_repo + "/workflow/cna.smk"
include: cfdna_wgs_repo + "/workflow/frag.smk"
This repository hosts a snakemake workflow for basic processing of whole-genome sequencing reads from cell-free DNA.
The master branch of the repository contains the most recent developments, while stable versions are saved as terminal branches (e.g. stable.1.0.0).
The workflow directory contains two types of workflows: process-focused snakefiles (reads.smk, cna.smk, frag.smk) suitable for integration into another snakemake pipeline using the include: directive, and the _int_test snakefile with examples of such integration using the repository test data.
- All software needed for the pipeline is present within the associated docker container (see docker and https://hub.docker.com/repository/docker/jeszyman/cfdna_wgs/general).
- See the example configuration yaml config/int_test.yaml and the wrapper workflow workflow/int_test for necessary run conditions.
- [2023-01-26 Thu] - Version 9.1.0: Repo cleanup
- [2023-01-26 Thu] - Version 9.0.0: Removed the -f 3 (proper-pair) flag from samtools filtering, as BWA's proper-pair flagging removes some fragments above a set maximum length. Added framework for benchmark analysis. Added conditional execution of downsampling. Removed (temporarily) the final wig and ichor commands of CNA, as these don't currently run correctly without full-genome alignment and so can't be validated on test data. Added local documentation of the cfdna-wgs dockerfile.
- [2023-01-21 Sat] - Version 8.0.0: Corrected rule filt_bam_to_frag_bed to fix mates of inputs, which seems to prevent errors in the bamtobed call. Frag_window_count now uses windows of consistent 5 Mb size, which are generated from rule make_gc_map_bind. Added a merged fragment counts file and zero-centered unit SD counts.
- [2022-12-07 Wed] - Version 7.0.0: Added copy number alteration and DELFI fragmentomics.
- [2022-10-17 Mon] - Version 6.0.0: Using fastp for read trimming (replaces trimmomatic). Simplified naming schema. Removed downsampling (will reinstate in later version).
- [2022-09-08 Thu] - Version 5.3.0: Some minor name changes.
- [2022-08-19 Fri] - Version 5.2.0 validated: Adds bamCoverage and plotCoverage from deeptools. Benchmarks BWA.
- [2022-08-09 Tue] - Version 5.1.0 validated: Added cfdna wgs-specific container for each rule, referenced to config
- [2022-08-05 Fri] - Version 5.0.0 validated: Added a symlink rule based on python dictionary. Added repo-specific output naming, added checks for sequence type and file readability to input tsv.
- [2022-06-27 Mon] - Version 4 validated. Further expanded read_qc.tsv table. Removed bam post-processing step and added a more expansive bam filtering step. Updated downsampling to work off filtered alignments.
- [2022-06-26 Sun] - Version 3.2 validated. Expanded the qc aggregate table and added some comments.
- [2022-06-24 Fri] - Validate version 3.1 which includes genome index build as a snakefile rule.
- [2022-06-24 Fri] - Validated version 3 with read number checkpoint for down-sampling.
- [2022-05-31 Tue] - Conforms to current biotools best practices.
- [2022-04-29 Fri] - Moved multiqc to integration testing as inputs are dependent on final sample labels. Integration testing works per this commit.
- kill v7 - not working for CNA
- for file in ./*; do base=$(basename $file); str=$(tail -n1 $file); echo $base $str; done
#########1#########2#########3#########4#########5#########6#########7#########8
# #
# Copy-number Alteration Analysis of #
# Cell-free DNA Whole Genome Sequencing #
# #
#########1#########2#########3#########4#########5#########6#########7#########8
- Snakemake
# Use readCounter to create windowed wig from bam file

rule bam_to_wig:
    benchmark: benchdir + "/{library}_ds{downsample}_{frag_distro}_cfdna_wgs_bam_to_wig.benchmark.txt",
    container: cfdna_wgs_container,
    input: cfdna_wgs_bams + "/{library}_ds{downsample}_frag{frag_distro}.bam",
    log: logdir + "/{library}_ds{downsample}_{frag_distro}_cfdna_wgs_bam_to_wig.log",
    output: cfdna_wgs_wigs + "/{library}_ds{downsample}_frag{frag_distro}.wig",
    params:
        chrs = chrs,
        outdir = cfdna_wgs_wigs,
        script = cfdna_wgs_scriptdir + "/bam_to_wig.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        mkdir -p {params.outdir}
        /opt/hmmcopy_utils/bin/readCounter \
        --chromosome "{params.chrs}" \
        --quality 20 \
        --window 1000000 \
        {input} > {output}
        """
- Shell script
#!/usr/bin/env bash
input=$1
output=$2
chrs=$3

/opt/hmmcopy_utils/bin/readCounter --window 1000000 --quality 20 \
    --chromosome "$chrs" \
    $input > $output
- Snakemake
# Run ichorCNA without a panel of normals

rule ichor_nopon:
    input: cfdna_wgs_wigs + "/{library}_ds{downsample}_frag{frag_distro}.wig",
    output: cfdna_wgs_ichor_nopon + "/{library}_ds{downsample}_frag{frag_distro}.cna.seg",
    params:
        script = cfdna_wgs_scriptdir + "/MOD_runIchorCNA.R",
        out_dir = cfdna_wgs_ichor_nopon,
    container: cfdna_wgs_container,
    shell:
        """
        Rscript {params.script} \
        --id {wildcards.library}_frag{wildcards.frag_distro} \
        --WIG {input} \
        --gcWig /opt/ichorCNA/inst/extdata/gc_hg38_1000kb.wig \
        --mapWig /opt/ichorCNA/inst/extdata/map_hg38_1000kb.wig \
        --centromere /opt/ichorCNA/inst/extdata/GRCh38.GCA_000001405.2_centromere_acen.txt \
        --normal "c(0.95, 0.99, 0.995, 0.999)" \
        --ploidy "c(2)" \
        --maxCN 3 \
        --estimateScPrevalence FALSE \
        --scStates "c()" \
        --outDir {params.out_dir}
        """
- Rscript
# file: ichorCNA.R
# authors: Gavin Ha, Ph.D.
#          Fred Hutch
# contact: <gha@fredhutch.org>
#
#          Justin Rhoades
#          Broad Institute
# contact: <rhoades@broadinstitute.org>
#
# ichorCNA: https://github.com/broadinstitute/ichorCNA
# date: July 24, 2019
# description: Hidden Markov model (HMM) to analyze Ultra-low pass whole genome sequencing (ULP-WGS) data.
# This script is the main script to run the HMM.

library(optparse)

option_list <- list(
    make_option(c("--WIG"), type = "character", help = "Path to tumor WIG file. Required."),
    make_option(c("--NORMWIG"), type = "character", default=NULL, help = "Path to normal WIG file. Default: [%default]"),
    make_option(c("--gcWig"), type = "character", help = "Path to GC-content WIG file; Required"),
    make_option(c("--mapWig"), type = "character", default=NULL, help = "Path to mappability score WIG file. Default: [%default]"),
    make_option(c("--normalPanel"), type="character", default=NULL, help="Median corrected depth from panel of normals. Default: [%default]"),
    make_option(c("--exons.bed"), type = "character", default=NULL, help = "Path to bed file containing exon regions. Default: [%default]"),
    make_option(c("--id"), type = "character", default="test", help = "Patient ID. Default: [%default]"),
    make_option(c("--centromere"), type="character", default=NULL, help = "File containing Centromere locations; if not provided then will use hg19 version from ichorCNA package. Default: [%default]"),
    make_option(c("--minMapScore"), type = "numeric", default=0.9, help="Include bins with a minimum mappability score of this value. Default: [%default]."),
    make_option(c("--rmCentromereFlankLength"), type="numeric", default=1e5, help="Length of region flanking centromere to remove. Default: [%default]"),
    make_option(c("--normal"), type="character", default="0.5", help = "Initial normal contamination; can be more than one value if additional normal initializations are desired. Default: [%default]"),
    make_option(c("--scStates"), type="character", default="NULL", help = "Subclonal states to consider. Default: [%default]"),
    make_option(c("--coverage"), type="numeric", default=NULL, help = "PICARD sequencing coverage. Default: [%default]"),
    make_option(c("--lambda"), type="character", default="NULL", help="Initial Student's t precision; must contain 4 values (e.g. c(1500,1500,1500,1500)); if not provided then will automatically use based on variance of data. Default: [%default]"),
    make_option(c("--lambdaScaleHyperParam"), type="numeric", default=3, help="Hyperparameter (scale) for Gamma prior on Student's-t precision. Default: [%default]"),
    # make_option(c("--kappa"), type="character", default=50, help="Initial state distribution"),
    make_option(c("--ploidy"), type="character", default="2", help = "Initial tumour ploidy; can be more than one value if additional ploidy initializations are desired. Default: [%default]"),
    make_option(c("--maxCN"), type="numeric", default=7, help = "Total clonal CN states. Default: [%default]"),
    make_option(c("--estimateNormal"), type="logical", default=TRUE, help = "Estimate normal. Default: [%default]"),
    make_option(c("--estimateScPrevalence"), type="logical", default=TRUE, help = "Estimate subclonal prevalence. Default: [%default]"),
    make_option(c("--estimatePloidy"), type="logical", default=TRUE, help = "Estimate tumour ploidy. Default: [%default]"),
    make_option(c("--maxFracCNASubclone"), type="numeric", default=0.7, help="Exclude solutions with fraction of subclonal events greater than this value. Default: [%default]"),
    make_option(c("--maxFracGenomeSubclone"), type="numeric", default=0.5, help="Exclude solutions with subclonal genome fraction greater than this value. Default: [%default]"),
    make_option(c("--minSegmentBins"), type="numeric", default=50, help="Minimum number of bins for largest segment threshold required to estimate tumor fraction; if below this threshold, then will be assigned zero tumor fraction."),
    make_option(c("--altFracThreshold"), type="numeric", default=0.05, help="Minimum proportion of bins altered required to estimate tumor fraction; if below this threshold, then will be assigned zero tumor fraction. Default: [%default]"),
    make_option(c("--chrNormalize"), type="character", default="c(1:22)", help = "Specify chromosomes to normalize GC/mappability biases. Default: [%default]"),
    make_option(c("--chrTrain"), type="character", default="c(1:22)", help = "Specify chromosomes to estimate params. Default: [%default]"),
    make_option(c("--chrs"), type="character", default="c(1:22,\"X\")", help = "Specify chromosomes to analyze. Default: [%default]"),
    make_option(c("--genomeBuild"), type="character", default="hg19", help="Genome build. Default: [%default]"),
    make_option(c("--genomeStyle"), type = "character", default = "NCBI", help = "NCBI or UCSC chromosome naming convention; use UCSC if desired output is to have \"chr\" string. [Default: %default]"),
    make_option(c("--normalizeMaleX"), type="logical", default=TRUE, help = "If male, then normalize chrX by median. Default: [%default]"),
    make_option(c("--minTumFracToCorrect"), type="numeric", default=0.1, help = "Tumor-fraction correction of bin and segment-level CNA if sample has minimum estimated tumor fraction. [Default: %default]"),
    make_option(c("--fracReadsInChrYForMale"), type="numeric", default=0.001, help = "Threshold for fraction of reads in chrY to assign as male. Default: [%default]"),
    make_option(c("--includeHOMD"), type="logical", default=FALSE, help="If FALSE, then exclude HOMD state. Useful when using large bins (e.g. 1Mb). Default: [%default]"),
    make_option(c("--txnE"), type="numeric", default=0.9999999, help = "Self-transition probability. Increase to decrease number of segments. Default: [%default]"),
    make_option(c("--txnStrength"), type="numeric", default=1e7, help = "Transition pseudo-counts. Exponent should be the same as the number of decimal places of --txnE. Default: [%default]"),
    make_option(c("--plotFileType"), type="character", default="pdf", help = "File format for output plots. Default: [%default]"),
    make_option(c("--plotYLim"), type="character", default="c(-2,2)", help = "ylim to use for chromosome plots. Default: [%default]"),
    make_option(c("--outDir"), type="character", default="./", help = "Output Directory. Default: [%default]"),
    make_option(c("--libdir"), type = "character", default=NULL, help = "Script library path. Usually exclude this argument unless custom modifications have been made to the ichorCNA R package code and the user would like to source those R files. Default: [%default]")
)

parseobj <- OptionParser(option_list=option_list)
opt <- parse_args(parseobj)
print(opt)
options(scipen=0, stringsAsFactors=F)

library(HMMcopy)
library(GenomicRanges)
library(GenomeInfoDb)
options(stringsAsFactors=FALSE)
options(bitmapType='cairo')

patientID <- opt$id
tumour_file <- opt$WIG
normal_file <- opt$NORMWIG
gcWig <- opt$gcWig
mapWig <- opt$mapWig
normal_panel <- opt$normalPanel
exons.bed <- opt$exons.bed # "0" if none specified
centromere <- opt$centromere
minMapScore <- opt$minMapScore
flankLength <- opt$rmCentromereFlankLength
normal <- eval(parse(text = opt$normal))
scStates <- eval(parse(text = opt$scStates))
lambda <- eval(parse(text = opt$lambda))
lambdaScaleHyperParam <- opt$lambdaScaleHyperParam
estimateNormal <- opt$estimateNormal
estimatePloidy <- opt$estimatePloidy
estimateScPrevalence <- opt$estimateScPrevalence
maxFracCNASubclone <- opt$maxFracCNASubclone
maxFracGenomeSubclone <- opt$maxFracGenomeSubclone
minSegmentBins <- opt$minSegmentBins
altFracThreshold <- opt$altFracThreshold
ploidy <- eval(parse(text = opt$ploidy))
coverage <- opt$coverage
maxCN <- opt$maxCN
txnE <- opt$txnE
txnStrength <- opt$txnStrength
normalizeMaleX <- as.logical(opt$normalizeMaleX)
includeHOMD <- as.logical(opt$includeHOMD)
minTumFracToCorrect <- opt$minTumFracToCorrect
fracReadsInChrYForMale <- opt$fracReadsInChrYForMale
chrXMedianForMale <- -0.1
outDir <- opt$outDir
libdir <- opt$libdir
plotFileType <- opt$plotFileType
plotYLim <- eval(parse(text=opt$plotYLim))
gender <- NULL
outImage <- paste0(outDir,"/", patientID,".RData")
genomeBuild <- opt$genomeBuild
genomeStyle <- opt$genomeStyle
chrs <- as.character(eval(parse(text = opt$chrs)))
chrTrain <- as.character(eval(parse(text=opt$chrTrain)));
chrNormalize <- as.character(eval(parse(text=opt$chrNormalize)));
seqlevelsStyle(chrs) <- genomeStyle
seqlevelsStyle(chrNormalize) <- genomeStyle
seqlevelsStyle(chrTrain) <- genomeStyle

## load ichorCNA library or source R scripts
if (!is.null(libdir) && libdir != "None"){
    source(paste0(libdir,"/R/utils.R"))
    source(paste0(libdir,"/R/segmentation.R"))
    source(paste0(libdir,"/R/EM.R"))
    source(paste0(libdir,"/R/output.R"))
    source(paste0(libdir,"/R/plotting.R"))
} else {
    library(ichorCNA)
}

## load seqinfo
seqinfo <- getSeqInfo(genomeBuild, genomeStyle)

if (substr(tumour_file,nchar(tumour_file)-2,nchar(tumour_file)) == "wig") {
    wigFiles <- data.frame(cbind(patientID, tumour_file))
} else {
    wigFiles <- read.delim(tumour_file, header=F, as.is=T)
}

## FILTER BY EXONS IF PROVIDED ##
## add gc and map to GRanges object ##
if (is.null(exons.bed) || exons.bed == "None" || exons.bed == "NULL"){
    targetedSequences <- NULL
}else{
    targetedSequences <- read.delim(exons.bed, header=T, sep="\t")
}

## load PoN
if (is.null(normal_panel) || normal_panel == "None" || normal_panel == "NULL"){
    normal_panel <- NULL
}

if (is.null(centromere) || centromere == "None" || centromere == "NULL"){
    # no centromere file provided
    centromere <- system.file("extdata", "GRCh37.p13_centromere_UCSC-gapTable.txt", package = "ichorCNA")
}
centromere <- read.delim(centromere,header=T,stringsAsFactors=F,sep="\t")
save.image(outImage)

## LOAD IN WIG FILES ##
numSamples <- nrow(wigFiles)
tumour_copy <- list()
for (i in 1:numSamples) {
    id <- wigFiles[i,1]
    ## create output directories for each sample ##
    dir.create(paste0(outDir, "/", id, "/"), recursive = TRUE)
    ### LOAD TUMOUR AND NORMAL FILES ###
    message("Loading tumour file:", wigFiles[i,1])
    tumour_reads <- wigToGRanges(wigFiles[i,2])
    ## LOAD GC/MAP WIG FILES ###
    # find the bin size and load corresponding wig files
    # binSize <- as.data.frame(tumour_reads[1,])$width
    message("Reading GC and mappability files")
    if (is.null(gcWig) || gcWig == "None" || gcWig == "NULL"){
        stop("GC wig file is required")
    }
    gc <- wigToGRanges(gcWig)
    if (is.null(mapWig) || mapWig == "None" || mapWig == "NULL"){
        message("No mappability wig file input, excluding from correction")
        map <- NULL
    } else {
        map <- wigToGRanges(mapWig)
    }
    message("Correcting Tumour")
    counts <- loadReadCountsFromWig(tumour_reads, chrs = chrs, gc = gc, map = map,
                                    centromere = centromere, flankLength = flankLength,
                                    targetedSequences = targetedSequences,
                                    chrXMedianForMale = chrXMedianForMale,
                                    genomeStyle = genomeStyle,
                                    fracReadsInChrYForMale = fracReadsInChrYForMale,
                                    chrNormalize = chrNormalize, mapScoreThres = minMapScore)
    tumour_copy[[id]] <- counts$counts #as(counts$counts, "GRanges")
    gender <- counts$gender
    ## load in normal file if provided
    if (!is.null(normal_file) && normal_file != "None" && normal_file != "NULL"){
        message("Loading normal file:", normal_file)
        normal_reads <- wigToGRanges(normal_file)
        message("Correcting Normal")
        counts <- loadReadCountsFromWig(normal_reads, chrs=chrs, gc=gc, map=map,
                                        centromere=centromere, flankLength = flankLength,
                                        targetedSequences=targetedSequences,
                                        genomeStyle = genomeStyle,
                                        chrNormalize = chrNormalize,
                                        mapScoreThres = minMapScore)
        normal_copy <- counts$counts #as(counts$counts, "GRanges")
        gender.normal <- counts$gender
    }else{
        normal_copy <- NULL
    }
    ### DETERMINE GENDER ###
    ## if normal file not given, use chrY, else use chrX
    message("Determining gender...", appendLF = FALSE)
    gender.mismatch <- FALSE
    if (!is.null(normal_copy)){
        if (gender$gender != gender.normal$gender){
            # use tumour; use normal if given
            # check if normal is same gender as tumour
            gender.mismatch <- TRUE
        }
    }
    message("Gender ", gender$gender)
    ## NORMALIZE GENOME-WIDE BY MATCHED NORMAL OR NORMAL PANEL (MEDIAN) ##
    tumour_copy[[id]] <- normalizeByPanelOrMatchedNormal(tumour_copy[[id]], chrs = chrs,
                                                         normal_panel = normal_panel,
                                                         normal_copy = normal_copy,
                                                         gender = gender$gender,
                                                         normalizeMaleX = normalizeMaleX)
    ### OUTPUT FILE ###
    ### PUTTING TOGETHER THE COLUMNS IN THE OUTPUT ###
    outMat <- as.data.frame(tumour_copy[[id]])
    #outMat <- outMat[,c(1,2,3,12)]
    outMat <- outMat[,c("seqnames","start","end","copy")]
    colnames(outMat) <- c("chr","start","end","log2_TNratio_corrected")
    outFile <- paste0(outDir,"/",id,".correctedDepth.txt")
    message(paste("Outputting to:", outFile))
    write.table(outMat, file=outFile, row.names=F, col.names=T, quote=F, sep="\t")
} ## end of for each sample

chrInd <- as.character(seqnames(tumour_copy[[1]])) %in% chrTrain
## get positions that are valid
valid <- tumour_copy[[1]]$valid
if (length(tumour_copy) >= 2) {
    for (i in 2:length(tumour_copy)){
        valid <- valid & tumour_copy[[i]]$valid
    }
}
save.image(outImage)

### RUN HMM ###
## store the results for different normal and ploidy solutions ##
ptmTotalSolutions <- proc.time() # start total timer
results <- list()
loglik <- as.data.frame(matrix(NA, nrow = length(normal) * length(ploidy), ncol = 7,
                               dimnames = list(c(), c("init", "n_est", "phi_est", "BIC",
                                                      "Frac_genome_subclonal", "Frac_CNA_subclonal", "loglik"))))
counter <- 1
compNames <- rep(NA, nrow(loglik))
mainName <- rep(NA, length(normal) * length(ploidy))
#### restart for purity and ploidy values ####
for (n in normal){
    for (p in ploidy){
        if (n == 0.95 & p != 2) { next }
        logR <- as.data.frame(lapply(tumour_copy, function(x) { x$copy })) # NEED TO EXCLUDE CHR X #
        param <-
getDefaultParameters(logR[valid & chrInd, , drop=F], maxCN = maxCN, includeHOMD = includeHOMD, ct.sc=scStates, ploidy = floor(p), e=txnE, e.same = 50, strength=txnStrength) param$phi_0 <- rep(p, numSamples) param$n_0 <- rep(n, numSamples) ############################################ ######## CUSTOM PARAMETER SETTINGS ######### ############################################ # 0.1x cfDNA # if (is.null(lambda)){ logR.var <- 1 / ((apply(logR, 2, sd, na.rm = TRUE) / sqrt(length(param$ct))) ^ 2) param$lambda <- rep(logR.var, length(param$ct)) param$lambda[param$ct %in% c(2)] <- logR.var param$lambda[param$ct %in% c(1,3)] <- logR.var param$lambda[param$ct >= 4] <- logR.var / 5 param$lambda[param$ct == max(param$ct)] <- logR.var / 15 param$lambda[param$ct.sc.status] <- logR.var / 10 }else{ param$lambda[param$ct %in% c(2)] <- lambda[2] param$lambda[param$ct %in% c(1)] <- lambda[1] param$lambda[param$ct %in% c(3)] <- lambda[3] param$lambda[param$ct >= 4] <- lambda[4] param$lambda[param$ct == max(param$ct)] <- lambda[2] / 15 param$lambda[param$ct.sc.status] <- lambda[2] / 10 } param$alphaLambda <- rep(lambdaScaleHyperParam, length(param$ct)) # 1x bulk tumors # #param$lambda[param$ct %in% c(2)] <- 2000 #param$lambda[param$ct %in% c(1)] <- 1750 #param$lambda[param$ct %in% c(3)] <- 1750 #param$lambda[param$ct >= 4] <- 1500 #param$lambda[param$ct == max(param$ct)] <- 1000 / 25 #param$lambda[param$ct.sc.status] <- 1000 / 75 #param$alphaLambda[param$ct.sc.status] <- 4 #param$alphaLambda[param$ct %in% c(1,3)] <- 5 #param$alphaLambda[param$ct %in% c(2)] <- 5 #param$alphaLambda[param$ct == max(param$ct)] <- 4 ############################################# ################ RUN HMM #################### ############################################# hmmResults.cor <- HMMsegment(tumour_copy, valid, dataType = "copy", param = param, chrTrain = chrTrain, maxiter = 50, estimateNormal = estimateNormal, estimatePloidy = estimatePloidy, estimateSubclone = estimateScPrevalence, verbose = TRUE) for (s in 1:numSamples){ iter <- hmmResults.cor$results$iter id <- names(hmmResults.cor$cna)[s] ## convert full diploid solution (of chrs to train) to have 1.0 normal or 0.0 purity ## check if there is an altered segment that has at least a minimum # of bins segsS <- hmmResults.cor$results$segs[[s]] segsS <- segsS[segsS$chr %in% chrTrain, ] segAltInd <- which(segsS$event != "NEUT") maxBinLength = -Inf if (sum(segAltInd) > 0){ maxInd <- which.max(segsS$end[segAltInd] - segsS$start[segAltInd] + 1) maxSegRD <- GRanges(seqnames=segsS$chr[segAltInd[maxInd]], ranges=IRanges(start=segsS$start[segAltInd[maxInd]], end=segsS$end[segAltInd[maxInd]])) hits <- findOverlaps(query=maxSegRD, subject=tumour_copy[[s]][valid, ]) maxBinLength <- length(subjectHits(hits)) } ## check if there are proportion of total bins altered # if segment size smaller than minSegmentBins, but altFrac > altFracThreshold, then still estimate TF cnaS <- hmmResults.cor$cna[[s]] altInd <- cnaS[cnaS$chr %in% chrTrain, "event"] == "NEUT" altFrac <- sum(!altInd, na.rm=TRUE) / length(altInd) if ((maxBinLength <= minSegmentBins) & (altFrac <= altFracThreshold)){ hmmResults.cor$results$n[s, iter] <- 1.0 } # correct integer copy number based on estimated purity and ploidy correctedResults <- correctIntegerCN(cn = hmmResults.cor$cna[[s]], segs = hmmResults.cor$results$segs[[s]], purity = 1 - hmmResults.cor$results$n[s, iter], ploidy = hmmResults.cor$results$phi[s, iter], cellPrev = 1 - hmmResults.cor$results$sp[s, iter], maxCNtoCorrect.autosomes = maxCN, maxCNtoCorrect.X = maxCN, 
minPurityToCorrect = minTumFracToCorrect, gender = gender$gender, chrs = chrs, correctHOMD = includeHOMD) hmmResults.cor$results$segs[[s]] <- correctedResults$segs hmmResults.cor$cna[[s]] <- correctedResults$cn ## plot solution ## outPlotFile <- paste0(outDir, "/", id, "/", id, "_genomeWide_", "n", n, "-p", p) mainName[counter] <- paste0(id, ", n: ", n, ", p: ", p, ", log likelihood: ", signif(hmmResults.cor$results$loglik[hmmResults.cor$results$iter], digits = 4)) plotGWSolution(hmmResults.cor, s=s, outPlotFile=outPlotFile, plotFileType=plotFileType, logR.column = "logR", call.column = "Corrected_Call", plotYLim=plotYLim, estimateScPrevalence=estimateScPrevalence, seqinfo=seqinfo, main=mainName[counter]) } iter <- hmmResults.cor$results$iter results[[counter]] <- hmmResults.cor loglik[counter, "loglik"] <- signif(hmmResults.cor$results$loglik[iter], digits = 4) subClonalBinCount <- unlist(lapply(hmmResults.cor$cna, function(x){ sum(x$subclone.status) })) fracGenomeSub <- subClonalBinCount / unlist(lapply(hmmResults.cor$cna, function(x){ nrow(x) })) fracAltSub <- subClonalBinCount / unlist(lapply(hmmResults.cor$cna, function(x){ sum(x$copy.number != 2) })) fracAltSub <- lapply(fracAltSub, function(x){if (is.na(x)){0}else{x}}) loglik[counter, "Frac_genome_subclonal"] <- paste0(signif(fracGenomeSub, digits=2), collapse=",") loglik[counter, "Frac_CNA_subclonal"] <- paste0(signif(as.numeric(fracAltSub), digits=2), collapse=",") loglik[counter, "init"] <- paste0("n", n, "-p", p) loglik[counter, "n_est"] <- paste(signif(hmmResults.cor$results$n[, iter], digits = 2), collapse = ",") loglik[counter, "phi_est"] <- paste(signif(hmmResults.cor$results$phi[, iter], digits = 4), collapse = ",") counter <- counter + 1 } } ## get total time for all solutions ## elapsedTimeSolutions <- proc.time() - ptmTotalSolutions message("Total ULP-WGS HMM Runtime: ", format(elapsedTimeSolutions[3] / 60, digits = 2), " min.") ### SAVE R IMAGE ### save.image(outImage) #save(tumour_copy, results, loglik, file=paste0(outDir,"/",id,".RData")) ### SELECT SOLUTION WITH LARGEST LIKELIHOOD ### loglik <- loglik[!is.na(loglik$init), ] if (estimateScPrevalence){ ## sort but excluding solutions with too large % subclonal fracInd <- which(loglik[, "Frac_CNA_subclonal"] <= maxFracCNASubclone & loglik[, "Frac_genome_subclonal"] <= maxFracGenomeSubclone) if (length(fracInd) > 0){ ## if there is a solution satisfying % subclonal ind <- fracInd[order(loglik[fracInd, "loglik"], decreasing=TRUE)] }else{ # otherwise just take largest likelihood ind <- order(as.numeric(loglik[, "loglik"]), decreasing=TRUE) } }else{#sort by likelihood only ind <- order(as.numeric(loglik[, "loglik"]), decreasing=TRUE) } #new loop by order of solutions (ind) outPlotFile <- paste0(outDir, "/", id, "/", id, "_genomeWide_all_sols") for(i in 1:length(ind)) { hmmResults.cor <- results[[ind[i]]] turnDevOff <- FALSE turnDevOn <- FALSE if (i == 1){ turnDevOn <- TRUE } if (i == length(ind)){ turnDevOff <- TRUE } plotGWSolution(hmmResults.cor, s=s, outPlotFile=outPlotFile, plotFileType="pdf", logR.column = "logR", call.column = "Corrected_Call", plotYLim=plotYLim, estimateScPrevalence=estimateScPrevalence, seqinfo = seqinfo, turnDevOn = turnDevOn, turnDevOff = turnDevOff, main=mainName[ind[i]]) } hmmResults.cor <- results[[ind[1]]] hmmResults.cor$results$loglik <- as.data.frame(loglik) hmmResults.cor$results$gender <- gender$gender hmmResults.cor$results$chrYCov <- gender$chrYCovRatio hmmResults.cor$results$chrXMedian <- gender$chrXMedian 
hmmResults.cor$results$coverage <- coverage outputHMM(cna = hmmResults.cor$cna, segs = hmmResults.cor$results$segs, results = hmmResults.cor$results, patientID = patientID, outDir=outDir) outFile <- paste0(outDir, "/", patientID, ".params.txt") outputParametersToFile(hmmResults.cor, file = outFile)
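For orientation, this is roughly how the vendored script gets invoked against the chr8 test references built later in this file. A minimal sketch, assuming a 1 Mb readCounter wig exists for the library; the WIG path and output directory are illustrative, while the script path matches the ichor_lowfract call further down:
Rscript /opt/ichorCNA/scripts/runIchorCNA.R \
    --id lib005 \
    --WIG test/analysis/wigs/lib005.wig \
    --gcWig test/inputs/gc_chr8_1000kb.wig \
    --mapWig test/inputs/map_chr8_1000kb.wig \
    --genomeBuild hg38 \
    --genomeStyle UCSC \
    --chrs "c(8)" \
    --chrTrain "c(8)" \
    --estimateScPrevalence FALSE \
    --scStates "c()" \
    --outDir test/analysis/ichor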
- Snakemake
# Filter fragments by length
rule cna_frag_filt_tmp:
    benchmark:
        benchdir + "/{library}_{frag_distro}_cfdna_wgs_frag_filt.benchmark.txt",
    container:
        cfdna_wgs_container,
    input:
        cfdna_wgs_cna_in_bams + "/{library}.bam",
    log:
        logdir + "/{library}_{frag_distro}_cfdna_wgs_frag_filt.log",
    output:
        # temp() wraps the full path, not just the directory variable
        nohead = temp(cfdna_wgs_cna_frag_bams + "/{library}_frag{frag_distro}.nohead"),
        onlyhead = temp(cfdna_wgs_cna_frag_bams + "/{library}_frag{frag_distro}.only"),
        final = cfdna_wgs_cna_frag_bams + "/{library}_frag{frag_distro}.bam",
    params:
        script = cfdna_wgs_scriptdir + "/frag_filt.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        frag_min=$(echo {wildcards.frag_distro} | sed -e "s/_.*$//g")
        frag_max=$(echo {wildcards.frag_distro} | sed -e "s/^.*_//g")
        {params.script} \
        {input} \
        {output.nohead} \
        $frag_min \
        $frag_max \
        {config[threads]} \
        {output.onlyhead} \
        {output.final}
        """
Note: this would require a rewrite of ds_cond_target_list.
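The rule passes seven positional arguments to frag_filt.sh. A minimal sketch of what such a script could look like, assuming a TLEN-based awk filter over the SAM body; this is an illustration matching the argument order above, not necessarily the repo's actual script:
#!/usr/bin/env bash
# Hypothetical frag_filt.sh: split header from body, filter by insert size, reassemble
in_bam=$1
nohead=$2
frag_min=$3
frag_max=$4
threads=$5
onlyhead=$6
final=$7

# Keep alignments whose absolute insert size (TLEN, field 9) falls in [frag_min, frag_max]
samtools view -@ $threads $in_bam \
    | awk -v min=$frag_min -v max=$frag_max 'sqrt($9*$9) >= min && sqrt($9*$9) <= max' > $nohead

# Pull the header, then reassemble a valid BAM
samtools view -H $in_bam > $onlyhead
cat $onlyhead $nohead | samtools view -@ $threads -b - > $final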
- Make a smaller fasta for indexing
#!/bin/echo For documentation, not intended to be executable:.
singularity shell ~/sing_containers/biotools.1.0.2.sif
repo=/home/jeszyman/repos/cfdna-wgs
wget --directory-prefix="${repo}/test/inputs/" https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz

# Crude extraction: grabs up to 1M lines after the chr8 header line
zcat "test/inputs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz" | grep -A 1000000 chr8 | gzip > test/inputs/chr8.fa.gz

# Test indexed size
mkdir -p /tmp/testbwa
bwa index -p /tmp/testbwa/chr8 test/inputs/chr8.fa.gz
rm ${repo}/test/inputs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
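The grep -A 1000000 trick can run past the end of chr8 into the next sequence. A safer sketch, assuming samtools is available in the biotools container, pulls chr8 by name:
# Safer chr8 extraction via faidx (name-based, cannot overrun into chr9)
gunzip -k test/inputs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
samtools faidx test/inputs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna
samtools faidx test/inputs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna chr8 | gzip > test/inputs/chr8.fa.gz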
- Make chr8-specific ichor references
singularity shell ~/sing_containers/cfdna_wgs.1.0.0.sif
~/wigToBigWig -clip /opt/ichorCNA/inst/extdata/gc_hg38_1000kb.wig test/inputs/hg38.chrom.sizes test/inputs/gc_chr8_1000kb.bw
bigWigToWig -chrom=chr8 test/inputs/gc_chr8_1000kb.bw test/inputs/gc_chr8_1000kb.wig
~/wigToBigWig -clip /opt/ichorCNA/inst/extdata/map_hg38_1000kb.wig test/inputs/hg38.chrom.sizes test/inputs/map_chr8_1000kb.bw
bigWigToWig -chrom=chr8 test/inputs/map_chr8_1000kb.bw test/inputs/map_chr8_1000kb.wig
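A quick sanity check on the resulting references, assuming the fixedStep wig layout survives the bigWig round trip:
# Each wig should declare chrom=chr8 with a 1 Mb step
grep fixedStep test/inputs/gc_chr8_1000kb.wig | head -2
grep fixedStep test/inputs/map_chr8_1000kb.wig | head -2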
- Get hg38 chromosome sizes and the ENCODE blacklist
wget --directory-prefix="/home/jeszyman/repos/cfdna-wgs/test/inputs" https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.chrom.sizes
wget --directory-prefix="/home/jeszyman/repos/cfdna-wgs/test/inputs" https://raw.githubusercontent.com/Boyle-Lab/Blacklist/master/lists/hg38-blacklist.v2.bed.gz
gunzip -c ~/repos/cfdna-wgs/test/inputs/hg38-blacklist.v2.bed.gz > ~/repos/cfdna-wgs/test/inputs/hg38-blacklist.v2.bed
singularity shell --bind /mnt ~/sing_containers/cfdna_wgs.1.0.0.sif

# Clear bam directory if present
if [ -r test/bam ]; then \rm -rf test/bam; fi
mkdir -p test/bam

# Create small bam files to store in repo. Subsample real bams to ~100 Mb.
sambamba view -s .005 -f bam -t 36 /mnt/ris/aadel/Active/mpnst/test/bam/new_HiSeq15_L002001_ACAC_extract_ds20.bam > test/inputs/lib003_hg38.bam
sambamba view -s .005 -f bam -t 36 /mnt/ris/aadel/Active/mpnst/test/bam/new_HiSeq15_L002001_ATCG_extract_ds20.bam > test/inputs/lib004_hg38.bam
sambamba view -s 0.01 -f bam -t 4 /mnt/ris/aadel/Active/mpnst/bam/cfdna_wgs/ds/lib105_ds10.bam > test/inputs/lib005.bam
sambamba view -s 0.01 -f bam -t 4 /mnt/ris/aadel/Active/mpnst/bam/cfdna_wgs/ds/lib205_ds10.bam > test/inputs/lib006.bam
for file in test/inputs/*.bam; do samtools index $file; done
mkdir -p ~/repos/cfdna-wgs/test/analysis/cfdna_frag_bams
cp ~/repos/cfdna-frag/test/bam/frag/*.bam ~/repos/cfdna-wgs/test/analysis/cfdna_frag_bams/
singularity shell --bind /mnt ~/sing_containers/cfdna_wgs.1.0.0.sif

# Get hg38 gc bigwig
wget --directory-prefix /tmp/ http://hgdownload.cse.ucsc.edu/gbdb/hg38/bbi/gc5BaseBw/gc5Base.bw

# Convert hg38 gc bigwig to tsv binned at 5 Mb (like Mathios, 2021)
multiBigwigSummary bins \
    --binSize 5000000 \
    --bwfiles /tmp/gc5Base.bw \
    --numberOfProcessors 4 \
    --outFileName /tmp/test.out \
    --outRawCounts /tmp/gc5mb.tsv
tail -n +2 /tmp/gc5mb.tsv > test/inputs/gc5mb.bed
- bedtools subtract -a "test/inputs/chr8.bed" -b "test/inputs/hg38-blacklist.v2.bed" > "test/inputs/keep.bed"
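chr8.bed is not created elsewhere in this file; one way to derive it from the chrom.sizes download above:
# Build a whole-chromosome bed for chr8 from hg38.chrom.sizes
awk -v OFS='\t' '$1 == "chr8" {print $1, 0, $2}' test/inputs/hg38.chrom.sizes > test/inputs/chr8.bed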
- Find reads that map to the start of chr8 and work them back into a fastq (sketch below)
- cite:dehner2021
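A minimal sketch of that round trip, assuming an indexed, coordinate-sorted input BAM; the region and output paths are illustrative:
# Extract pairs mapping to the start of chr8, then regenerate fastqs.
# samtools fastq needs name-collated input, hence the sort -n.
samtools view -b test/inputs/lib005.bam chr8:1-10000000 \
    | samtools sort -n - \
    | samtools fastq -1 /tmp/chr8_R1.fastq -2 /tmp/chr8_R2.fastq -0 /dev/null -s /dev/null -n -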
- using mosdepth
#########1#########2#########3#########4#########5#########6#########7#########8
#
### mosdepth for WGS depth calc ###
#
# Setup
##
# Mosdepth per bam dir
##
## For deduped bams (mosdepth_mpnst is a wrapper defined elsewhere)
for file in $localdata/bams/*.dedup.sorted.bam; do
    mosdepth_mpnst $file $localdata/bam-qc/dedup 250000000
done
##
#
# Get simple tsv and send to repo
for file in $localdata/bam-qc/dedup/lib*.regions.bed.gz; do
    base=`basename -s .dedup.sorted.regions.bed.gz $file`
    zcat $file | awk -v FS='\t' -v var=$base 'NR <= 24 {print var,$1,$4}' >> $localdata/bam-qc/dedup/all_dedup_coverage
done
header=library_id\\tchr\\tmean_coverage
sed -i "1 i$header" $localdata/bam-qc/dedup/all_dedup_coverage
##
## Local
source ~/repos/mpnst/bin/local-setup.sh
docker_interactive biotools
##
## Functions
###
### Convert bams to wigs
bam_to_wig() {
    printf "Variables are: 1=bam_file 2=bam_suffix 3=outdir\n"
    base=`basename -s ${2} $1`
    if [ $3/${base}.wig -ot $1 ]; then
        /opt/hmmcopy_utils/bin/readCounter --window 1000000 --quality 20 \
            --chromosome "chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chrX,chrY" \
            $1 > $3/${base}.wig
    fi
}
###
### Run ichor for low TF
ichor_lowfract() {
    base=`basename -s .wig $1`
    if [ $2/$base.RData -ot $1 ]; then
        Rscript /opt/ichorCNA/scripts/runIchorCNA.R \
            --id $base \
            --WIG $1 \
            --gcWig /opt/ichorCNA/inst/extdata/gc_hg19_1000kb.wig \
            --normal "c(0.95, 0.99, 0.995, 0.999)" \
            --ploidy "c(2)" \
            --maxCN 3 \
            --estimateScPrevalence FALSE \
            --scStates "c()" \
            --outDir $2
    fi
}
##
##
mkdir -p $localdata/wigs
mkdir -p $localdata/ichor
#
# Make wigs
#
#bam_to_wig /mnt/xt3/mpnst/frag-filt-bams/lib109.dedup.sorted.frag90_150.sorted.bam .dedup.sorted.frag90_150.sorted.bam $localdata/wigs
##
for file in $localdata/frag-filt-bams/lib109*.bam; do
    bam_to_wig $file \
        .dedup.sorted.frag.sorted.bam \
        $localdata/wigs
done
## For fraction-filtered WGS cfDNA
for file in $localdata/frag-filt-bams/*.bam; do
    bam_to_wig $file \
        .dedup.sorted.frag.sorted.bam \
        $localdata/wigs
done
##
## For tumor and leukocyte WGS libraries
### Make array of genomic library file paths
genomic=($(cat /drive3/users/jszymanski/repos/mpnst/data/libraries.csv | grep -e tumor -e leukocyte | grep -v "wes" | awk -F, '{print $1}' | sed 's/"//g' | sed 's/$/.dedup.sorted.bam/g' | sed 's/^/\/mnt\/xt3\/mpnst\/bams\//g'))
###
for file in ${genomic[@]}; do
    bam_to_wig $file \
        .dedup.sorted.bam \
        $localdata/wigs
done
#
##
## Send successful file list to repo
rm /drive3/users/jszymanski/repos/mpnst/data/wigs.tsv
for file in $localdata/wigs/*.wig; do
    base=`basename -s .wig $file`
    echo $base >> /drive3/users/jszymanski/repos/mpnst/data/wigs.tsv
done
#
## RESUME HERE
# ichor
##
for file in $localdata/wigs/lib109*.wig; do
    ichor_lowfract $file $localdata/ichor
done

# Guard against committing an oversized coverage table
max_file_size=5000000
file_size=$( wc -c <"$localdata/bam-qc/dedup/all_dedup_coverage" )
if [ $file_size -gt $max_file_size ]; then
    touch $repo/data/qc/all_dedup_coverage_too_big
else
    cp $localdata/bam-qc/dedup/all_dedup_coverage $repo/qc/all_dedup_coverage.tsv
fi
#
- Can't calculate depths from ~/repos/mpnst/data/bam_qc_data/mqc_mosdepth-coverage-per-contig_1.txt; it does not allow values under 1
- [ ] for coverage, should intersect down to autosomes
- https://github.com/brentp/mosdepth
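For reference, a bare-bones mosdepth call in the spirit of the mosdepth_mpnst wrapper above; the prefix, window size, and paths are illustrative:
# Fast whole-genome depth in 500 kb windows, skipping per-base output
mosdepth --no-per-base --by 500000 --threads 4 \
    test/analysis/qc/lib005 test/inputs/lib005.bam
# Outputs land at <prefix>.mosdepth.summary.txt and <prefix>.regions.bed.gz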
- Run and extract mosdepth: mosdepthRAW = as_tibble(read.table(file.path(repo, "data/all_dedup_coverage.tsv"), header = T, sep = '\t', fill = TRUE))
- Snakemake
# Downsample bam file to a set number of reads
rule cfdna_wgs_downsample:
    benchmark:
        benchdir + "/{library}_{milreads}_cfdna_wgs_downsample.benchmark.txt",
    container:
        cfdna_wgs_container,
    input:
        cfdna_wgs_bams + "/{library}_filt.bam",
    log:
        logdir + "/{library}_{milreads}_cfdna_wgs_downsample.log",
    output:
        cfdna_wgs_bams + "/{library}_ds{milreads}.bam",
    params:
        milreads = MILREADS,
        script = cfdna_wgs_scriptdir + "/downsample.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        {params.script} \
        {input} \
        {wildcards.milreads} \
        {output} &> {log}
        """
- Shell script
#!/usr/bin/env bash

# For unit testing
# in_bam="test/analysis/cfdna_wgs_bams/lib001_filt.bam"
# out_bam=/tmp/test.bam
# milreads="0.0041"

in_bam=$1
milreads="$2"
out_bam=$3

reads=$(echo | awk -v var1=$milreads '{ print 1000000*var1 }')

## Calculate the sampling factor based on the intended number of reads:
FACTOR=$(samtools idxstats $in_bam | cut -f3 | awk -v COUNT=$reads 'BEGIN {total=0} {total += $1} END {print COUNT/total}')

# NB: [[ $FACTOR > 1 ]] would compare strings; use awk for a numeric test on the float
if [ $(echo $FACTOR | awk '{print ($1 > 1)}') -eq 1 ]; then
    echo "DS reads exceeds total for $in_bam"
else
    sambamba view -s $FACTOR -f bam -l 5 $in_bam > $out_bam
fi
# Alignment deduplication and sorting
rule alignment_processing:
input:
config["datadir"] + "/bam/{library_id}_raw.bam",
    output:
        # declared so the shell block's {output.bam} resolves; the exact name is an assumption
        bam = temp(config["datadir"] + "/bam/{library_id}_conv.bam"),
        dedup = temp(config["datadir"] + "/bam/{library_id}_dedup_unsort.bam"),
        sort = config["datadir"] + "/bam/{library_id}_dedup.bam",
        index = config["datadir"] + "/bam/{library_id}_dedup.bam.bai",
log:
config["datadir"] + "/logs/alignment_processing_{library_id}.log"
shell:
"""
{config[cfdna_wgs_script_dir]}/alignment_processing.sh \
{input} \
{config[threads]} \
{output.bam} \
{output.dedup} \
{output.sort} \
{output.index} \
&> {log}
"""
- Script
#!/usr/bin/env bash

<#bash_preamble#>

input=$1
threads=$2
output_bam=$3
output_dedup=$4
output_sort=$5
output_index=$6

# Convert alignment to bam, remove duplicates, then coordinate-sort and index.
# NB: -S treats the input as SAM; drop it if {library_id}_raw.bam is already a true BAM.
sambamba view -t $threads -S -f bam $input > $output_bam
sambamba markdup -r -t $threads $output_bam $output_dedup
sambamba sort -t $threads $output_dedup -o $output_sort
sambamba index -t $threads $output_sort $output_index
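A quick post-run sanity check, with an illustrative library name:
# Confirm duplicates were removed and the index was written
samtools flagstat test/bam/lib001_dedup.bam
ls -lh test/bam/lib001_dedup.bam.bai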