Skip to content

Latest commit

 

History

History
executable file
·
2819 lines (2515 loc) · 102 KB

cfdna-wgs.org

File metadata and controls

executable file
·
2819 lines (2515 loc) · 102 KB

Cell-free DNA Whole-Genome Sequencing

:header-args+: :noweb yes

Setup inputs for testing

  • test/inputs/libraries.tsv
  • test/inputs/cna_libraries.tsv
  • Get fastq inputs for read processing and testing
    #!/bin/echo For documentation, not intended to be executable:.
    # Build small paired-end fastq fixtures by keeping the first $fq_size lines
    # of each source fastq, then gzip the results in place.
    mntpt=/mnt/ris/aadel/Active
    singularity shell --bind ${mntpt} ~/sing_containers/biotools.1.0.2.sif

    # Variables are declared again here (presumably because the commands below
    # are run inside the container shell opened above).
    repo=/home/jeszyman/repos/cfdna-wgs
    mntpt=/mnt/ris/aadel/Active
    mkdir -p ${repo}/test/inputs

    fq_size=100000
    seq_dir=${mntpt}/mpnst/inputs/seq
    out_dir=${repo}/test/inputs

    # Each entry is "<source prefix under seq_dir>:<test library name>".
    for pair in \
        MPNST/19_2_082:mpnst1 \
        MPNST/25_2_072:mpnst2 \
        PN/37_JS0050CD112717:plex1 \
        PN/30_JS0044CD112818:plex2
    do
        src=${pair%%:*}
        dest=${pair##*:}
        for mate in R1 R2; do
            zcat ${seq_dir}/${src}_${mate}.fastq.gz | head -n $fq_size > ${out_dir}/${dest}_${mate}.fastq
        done
    done

    # Compress every truncated fastq, overwriting any existing .gz.
    for fq in ${repo}/test/inputs/*.fastq; do gzip -f $fq; done
    
        
  • Get bams for CNA and frag testing
    # Build small BAM fixtures for CNA and fragmentomics testing by randomly
    # subsampling existing BAMs with sambamba, then indexing them with samtools.
    mntpt=/mnt/ris/aadel/Active
    singularity shell --bind ${mntpt} ~/sing_containers/biotools.1.0.2.sif
    
    repo=/home/jeszyman/repos/cfdna-wgs
    mntpt=/mnt/ris/aadel/Active
    mkdir -p ${repo}/test/inputs
    
    # Create small bam files to store in repo. Subsample real bams to ~100 Mb.
    # NOTE(review): this first BAM is name-sorted (samtools sort -n); samtools
    # index in the loop below expects coordinate-sorted input — confirm intent.
    sambamba view -s .005 -f bam -t 36 /mnt/ris/aadel/Active/mpnst/test/bam/new_HiSeq15_L002001_ACAC_extract_ds20.bam | samtools sort -@4 -n -o test/inputs/lib003_hg38.bam -
    
    # NOTE(review): written to test/bam/ while the others go to test/inputs/, so
    # it is not covered by the indexing loop and test/bam/ is not created above.
    sambamba view -s .005 -f bam -t 36 /mnt/ris/aadel/Active/mpnst/test/bam/new_HiSeq15_L002001_ATCG_extract_ds20.bam > test/bam/lib004_hg38.bam
    
    sambamba view -s 0.01 -f bam -t 4 /mnt/ris/aadel/Active/mpnst/bam/cfdna_wgs/ds/lib105_ds10.bam > test/inputs/lib005.bam
    
    sambamba view -s 0.01 -f bam -t 4 /mnt/ris/aadel/Active/mpnst/bam/cfdna_wgs/ds/lib205_ds10.bam > test/inputs/lib006.bam
    
    # Index every BAM under test/inputs (paths are relative to the current directory).
    for file in test/inputs/*.bam; do samtools index $file; done
        

Dockerfile

Preamble

FROM jeszyman/biotools:1.0.2

#################
###   Notes   ###
#################
#
# After build, the image will be pushed to the dockerhub as
# jeszyman/cfdna_wgs
# (https://hub.docker.com/repository/docker/jeszyman/cfdna_wgs)

IchorCNA

#
# Clone hmmcopy_utils into /opt/hmmcopy_utils and build it in place.
RUN git clone https://github.com/shahcompbio/hmmcopy_utils.git /opt/hmmcopy_utils \
    && cd /opt/hmmcopy_utils \
    && cmake . \
    && make

#
# ichorCNA
##
## linux dependencies
# System libraries installed ahead of the R packages below. The apt package
# lists are removed in the same layer so they do not add to the image size.
RUN apt-get update \
   && apt-get install -y \
   libcurl4-openssl-dev \
   libssl-dev \
   libxml2-dev \
   && rm -rf /var/lib/apt/lists/*
#RUN rm /usr/lib/x86_64-linux-gnu/libcurl.so.4
#RUN ln -s /usr/lib/x86_64-linux-gnu/libcurl.so.4.5.0 /usr/lib/x86_64-linux-gnu/libcurl.so.4
##
## R dependencies
# Install BiocManager, run its default install, then install each listed
# Bioconductor package in turn.
RUN R -e 'install.packages("BiocManager"); \
    BiocManager::install(); \
    for (pkg in c("HMMcopy", "GenomeInfoDb", "GenomicRanges")) BiocManager::install(pkg);'
##
## git clone install
# Clone ichorCNA into /opt/ichorCNA and install it as an R package from source.
RUN git clone https://github.com/broadinstitute/ichorCNA.git /opt/ichorCNA \
    && R CMD INSTALL /opt/ichorCNA
##
###   Parameters intended to be common across workflows   ###

# Blacklist bed handed to the deepTools-based QC rules
blklist: "test/inputs/hg38-blacklist.v2.bed"
# Root data directory; the Snakefile derives analysis, benchmark, log and ref paths from it
datadir: "test"
# Container image path read into default_container
default_container: "/home/jeszyman/sing_containers/biotools.1.0.2.sif"
# Fasta used as input to bwa index (and as the Picard reference sequence)
genome_fasta: "test/inputs/chr8.fa"
# bwa index prefix; the index rule also touches a marker file at this path
genome_ref: "test/ref/chr8.fa"
# NOTE(review): the integration Snakefile computes qcdir from datadir instead of
# reading this key — confirm whether other workflows consume it
qcdir: "test/analysis/qc"
# Thread count passed to the per-rule scripts
threads: 4

###   Unique properties from this repo   ###

# Container image path used by the cfdna_wgs_* rules
cfdna_wgs_container: "/home/jeszyman/sing_containers/cfdna_wgs.1.0.0.sif"

# Repository root; scripts are resolved under <repo>/scripts and rule files under <repo>/workflow
cfdna_wgs_repo: "/home/jeszyman/repos/cfdna-wgs"

# Read into DOWNSAMPLE by the Snakefile (presumably downsampling fractions — confirm against frag/CNA rules)
downsample:
  - "0.001"
  - "0.004"

# Read into FRAG_DISTROS by the Snakefile (presumably a min_max fragment-length window — confirm)
frag_distro: "90_150"

# NOTE(review): not referenced in the rules visible here; presumably a 5 Mb GC-content bed — confirm
gc5mb: "test/inputs/gc5mb.bed"

# Picard jar passed to the picard_depth script
picard_jar: "/opt/picard/picard.jar"

Preamble

##################################################################
###   Integration testing snakefile for WGS cfDNA Processing   ###
##################################################################

import os
import re

import numpy as np
import pandas as pd

Variable naming

# Values read straight from the configuration file
DOWNSAMPLE          = config["downsample"]
FRAG_DISTROS        = config["frag_distro"]
threads             = config["threads"]
cfdna_wgs_threads   = config["threads"]
default_container   = config["default_container"]
cfdna_wgs_container = config["cfdna_wgs_container"]
genome_fasta        = config["genome_fasta"]
genome_ref          = config["genome_ref"]
cfdna_wgs_repo      = config["cfdna_wgs_repo"]

# Directory layout, built up from the configured data directory
datadir  = config["datadir"]
benchdir = datadir + "/benchmark"
logdir   = datadir + "/logs"
refdir   = datadir + "/ref"
qcdir    = datadir + "/analysis/qc"

# cfDNA WGS analysis tree: each path extends its parent directory
cfdna_wgs                 = datadir + "/analysis/cfdna_wgs"
cfdna_wgs_bams            = cfdna_wgs + "/bams"
cfdna_wgs_fastqs          = cfdna_wgs + "/fastqs"
cfdna_wgs_frag            = cfdna_wgs + "/frag"
cfdna_wgs_frag_beds       = cfdna_wgs_frag + "/beds"
cfdna_wgs_frag_counts     = cfdna_wgs_frag + "/counts"
cfdna_wgs_frag_gc_distros = cfdna_wgs_frag + "/distros"

# Helper scripts live under the repository's scripts directory
cfdna_wgs_scriptdir = cfdna_wgs_repo + "/scripts"

Functions, miscellaneous

###   Functions   ###

# Setup sample name index as a python dictionary.
# The sample sheet must provide the columns used below: file, library,
# library_type, isolation_type and cohort.
cfdna_wgs_libraries = pd.read_table(config["datadir"] + "/inputs/libraries.tsv")

# Ensure readable fastqs: flag each listed file by whether this process can
# read it, then keep only the readable rows. An explicit boolean Series keeps
# the mask valid even when the sample sheet has no rows.
readable = pd.Series(
    [os.access(fastq, os.R_OK) for fastq in cfdna_wgs_libraries["file"]],
    index=cfdna_wgs_libraries.index,
    dtype=bool,
)
cfdna_wgs_libraries["readable"] = readable
cfdna_wgs_libraries = cfdna_wgs_libraries[readable]

# Ensure correct library type per sample sheet
cfdna_wgs_libraries = cfdna_wgs_libraries[cfdna_wgs_libraries.library_type == "wgs"]
cfdna_wgs_libraries = cfdna_wgs_libraries[cfdna_wgs_libraries.isolation_type == "cfdna"]

# Make the dictionary: library id -> fastq path recorded in the "file" column
cfdna_wgs_library_indict = cfdna_wgs_libraries["library"].tolist()
cfdna_wgs_file_indict = cfdna_wgs_libraries["file"].tolist()
cfdna_wgs_lib_dict = dict(zip(cfdna_wgs_library_indict, cfdna_wgs_file_indict))

CFDNA_WGS_LIBRARIES = list(cfdna_wgs_lib_dict.keys())
CFDNA_WGS_FASTQS = list(cfdna_wgs_lib_dict.values())

# Make a list of healthy libraries (rows whose cohort is "healthy")
CFDNA_WGS_HEALTHY_LIBRARIES = cfdna_wgs_libraries[cfdna_wgs_libraries['cohort'] == 'healthy']['library'].tolist()

All rule

# Target rule: lists the final files the integration test must produce.
# The two qcdir tables are the outputs of cfdna_wgs_make_qc_tsv.
# NOTE(review): the rules producing aggregate_output and ratios.tsv are not in
# this section — presumably they come from the included workflow files; confirm.
rule all:
    input:
        logdir + "/aggregate_output",
        cfdna_wgs_frag + "/ratios.tsv",
        qcdir + "/cfdna_wgs_read_qc.tsv",
        qcdir + "/cfdna_wgs_frag_len.tsv",

Benchmark aggregation

# After a successful run, aggregate the per-rule benchmark files: agg_bench.sh
# is invoked with the benchmark directory and the destination TSV as arguments.
onsuccess:
    shell("""
        bash {cfdna_wgs_scriptdir}/agg_bench.sh {benchdir} {qcdir}/agg_bench.tsv
        """)
# For unit testing
# Aggregate Snakemake benchmark files into a single TSV.
# Arguments (both optional; the defaults are the unit-testing paths):
#   1  directory holding the per-rule benchmark files
#   2  path of the aggregated TSV to write
indir="${1:-test/benchmark}"
output="${2:-test/analysis/qc/bench_agg.tsv}"

mkdir -p "$(dirname "$output")"

# Write the header first so the file is valid even when no benchmark files
# exist; this also replaces any output left over from a previous run.
printf 'process\tfloat_sec\trun_time\tmax_rss\tmax_vms\tmax_uss\tmax_pss\tio_in\tio_out\tmean_load\tcpu_time\n' > "$output"

for file in "$indir"/*
do
    # Skip the literal pattern left behind when the glob matches nothing.
    [ -f "$file" ] || continue
    base=$(basename "$file")
    # Drop each file's own header line and prefix every row with the file name.
    awk -v OFS='\t' -v var="$base" 'NR>1 {print var,$0}' "$file" >> "$output"
done
# Exploratory look at the aggregated benchmark table: label each row with its
# library and process, then plot run times and name the slow outliers.
library(tidyverse)

bmk_in = read_tsv("~/repos/cfdna-wgs/test/analysis/qc/bench_agg.tsv")

bmk =
  bmk_in %>%
  # Strip the literal file suffix (fixed = TRUE so the dots are not regex wildcards)
  mutate(process = gsub(".benchmark.txt", "", process, fixed = TRUE)) %>%
  # Per-library benchmarks start with "libNNN_"; anything else is workflow-wide
  mutate(library = ifelse(grepl("lib[0-9]{3}_", process),
                          sub("^.*lib(\\d{3}).*$", "lib\\1", process), "all_libs")) %>%
  # Keep the library-qualified name before reducing process to the bare rule name
  mutate(process_lib = process) %>%
  mutate(process = gsub("^lib..._","", process))

# TRUE for values above the upper Tukey fence (Q3 + 1.5 * IQR)
find_outlier <- function(x) {
  return(x > quantile(x, .75) + 1.5*IQR(x))
}

# Boxplot of run times with outlying processes labelled by name
bmk %>% mutate(outlier = ifelse(find_outlier(run_time), process_lib, NA)) %>%
  ggplot(.,aes(y=run_time)) +
  geom_boxplot() +
  geom_text(aes( y = run_time, x = .1,label=outlier), na.rm=TRUE, position = position_jitter())

Symlink input fastqs

# Symlink a library's raw paired fastqs into the workflow fastq directory.
# The input is the fastq path stored for the library in cfdna_wgs_lib_dict;
# symlink.sh receives it along with both output paths and the output directory.
rule symlink_inputs:
    container: default_container,
    input:
        lambda wildcards: cfdna_wgs_lib_dict[wildcards.library],
    output:
        read1 = cfdna_wgs_fastqs + "/{library}_raw_R1.fastq.gz",
        read2 = cfdna_wgs_fastqs + "/{library}_raw_R2.fastq.gz",
    params:
        outdir = cfdna_wgs_fastqs,
        script = cfdna_wgs_scriptdir + "/symlink.sh",
    shell:
        """
        {params.script} \
        {input} \
        {output.read1} \
        {output.read2} \
        {params.outdir}
        """
#!/usr/bin/env bash
set -o errexit   # abort on nonzero exitstatus
set -o nounset   # abort on unbound variable
set -o pipefail  # don't hide errors within pipes

# Link a library's paired fastqs into the workflow fastq directory.
# Arguments:
#   1  path to the read-1 fastq
#   2  symlink to create for read 1
#   3  symlink to create for read 2
#   4  directory that will hold the symlinks (created if absent)
input_read1="${1}"
output_read1="${2}"
output_read2="${3}"
outdir="${4}"

# All expansions are quoted so paths containing whitespace survive intact.
mkdir -p "${outdir}"

# The read-2 path is derived from read 1 by swapping every "_R1" for "_R2".
input_read2="$(echo "${input_read1}" | sed "s/_R1/_R2/g")"

ln -sf --relative "${input_read1}" "${output_read1}"
ln -sf --relative "${input_read2}" "${output_read2}"

Includes statements

include: cfdna_wgs_repo + "/workflow/reads.smk"
include: cfdna_wgs_repo + "/workflow/frag.smk"

Basic read processing

Preamble

#########1#########2#########3#########4#########5#########6#########7#########8
#                                                                              #
#                    Basic Read and Alignment Processing of                    #
#                    Cell-free DNA Whole Genome Sequencing                     #
#                                                                              #
#########1#########2#########3#########4#########5#########6#########7#########8

Read and alignment processing

Make alignment index

  • Snakemake
    # Make alignment index
    #  Note: Upon first run, this rule will touch an empty file with the same path
    #        as the index prefix. Thereafter, you can avoid repeat indexing when the
    #        rule "sees" this empty file. For repo integration testing with an
    #        external reference, indexing can likewise be avoided with this empty
    #        file at the external index location.
    #  Note: params.script and params.threads are declared but not referenced by
    #        the shell command, which calls bwa index directly.
    
    rule cfdna_wgs_index:
        benchmark: benchdir + "/cfdna_wgs_index.benchmark.txt",
        container: cfdna_wgs_container,
        input: genome_fasta,
        log: logdir + "/cfdna_wgs_index.log",
        output: done = touch(genome_ref),
        params:
            out_prefix = genome_ref,
            script = cfdna_wgs_scriptdir + "/index.sh",
            threads = cfdna_wgs_threads,
        shell:
            """
            bwa index -p {params.out_prefix} {input} &> {log}
            """
    
        

Adapter-trim and QC reads with fastp

  • Snakemake
    # Adapter-trim and QC reads with fastp
    #  - Inputs are the raw paired fastqs; outputs are the processed pair plus
    #    fastp's failed-read and unpaired-read files.
    #  - fastp's html and json reports are declared under log: alongside the
    #    command log that captures the script's stdout/stderr.
    #  - Arguments are positional: their order must match fastp.sh.
    rule cfdna_wgs_fastp:
        benchmark: benchdir + "/{library}_cfdna_wgs_fastp.benchmark.txt",
        container: cfdna_wgs_container,
        input:
            read1 = cfdna_wgs_fastqs + "/{library}_raw_R1.fastq.gz",
            read2 = cfdna_wgs_fastqs + "/{library}_raw_R2.fastq.gz",
        log:
            cmd = logdir + "/{library}_cfdna_wgs_fastp.log",
            html = logdir + "/{library}_cfdna_wgs_fastp.html",
            json = logdir + "/{library}_cfdna_wgs_fastp.json",
        output:
            read1 = cfdna_wgs_fastqs + "/{library}_processed_R1.fastq.gz",
            read2 = cfdna_wgs_fastqs + "/{library}_processed_R2.fastq.gz",
            failed = cfdna_wgs_fastqs + "/{library}_failed_fastp.fastq.gz",
            unpaired1 = cfdna_wgs_fastqs + "/{library}_unpaired_R1.fastq.gz",
            unpaired2 = cfdna_wgs_fastqs + "/{library}_unpaired_R2.fastq.gz",
        params:
            script = cfdna_wgs_scriptdir + "/fastp.sh",
            threads = cfdna_wgs_threads,
        resources:
            mem_mb = 500,
        shell:
            """
            {params.script} \
            {input.read1} \
            {input.read2} \
            {log.html} \
            {log.json} \
            {output.read1} \
            {output.read2} \
            {output.failed} \
            {output.unpaired1} \
            {output.unpaired2} \
            {params.threads} &> {log.cmd}
            """
        
  • Shell script
    #!/usr/bin/env bash
    set -o errexit   # abort on nonzero exitstatus
    set -o nounset   # abort on unbound variable
    set -o pipefail  # don't hide errors within pipes

    # Adapter-trim and QC one paired-end library with fastp.
    # Arguments:
    #   1   read-1 input fastq        6   read-2 trimmed output
    #   2   read-2 input fastq        7   output for reads failing filters
    #   3   html report path          8   read-1 unpaired output
    #   4   json report path          9   read-2 unpaired output
    #   5   read-1 trimmed output     10  thread count

    # Script variables

    input_read1="${1}"
    input_read2="${2}"
    log_html="${3}"
    log_json="${4}"
    output_read1="${5}"
    output_read2="${6}"
    output_failed="${7}"
    output_unpaired1="${8}"
    output_unpaired2="${9}"
    params_threads="${10}"

    # Functions
    main(){
        fastp_wrap "$input_read1" \
                   "$input_read2" \
                   "$log_html" \
                   "$log_json" \
                   "$output_read1" \
                   "$output_read2" \
                   "$output_failed" \
                   "$output_unpaired1" \
                   "$output_unpaired2" \
                   "$params_threads"
    }

    # Run fastp on one read pair. Takes the same ten arguments, in the same
    # order, as the script itself, and reads only those arguments.
    fastp_wrap(){
        local input_read1="${1}"
        local input_read2="${2}"
        local log_html="${3}"
        local log_json="${4}"
        local output_read1="${5}"
        local output_read2="${6}"
        local output_failed="${7}"
        local output_unpaired1="${8}"
        local output_unpaired2="${9}"
        local params_threads="${10}"
        #
        fastp --detect_adapter_for_pe \
              --failed_out "$output_failed" \
              --in1 "$input_read1" \
              --in2 "$input_read2" \
              --html "$log_html" \
              --json "$log_json" \
              --out1 "$output_read1" \
              --out2 "$output_read2" \
              --unpaired1 "$output_unpaired1" \
              --unpaired2 "$output_unpaired2" \
              --thread "$params_threads"
        }

    # Run
    main "$@"
        

Align reads with BWA

  • Snakemake
    # Align reads with BWA
    #  - input.ref is the bwa index prefix (the marker file made by cfdna_wgs_index).
    #  - The .bai output is created by align.sh next to the sorted BAM, so it is
    #    declared here but not passed on the command line.
    #  - The thread count comes from the configuration, like the other rules.
    rule cfdna_wgs_align:
        benchmark: benchdir + "/{library}_cfdna_wgs_align.benchmark.txt",
        container: cfdna_wgs_container,
        input:
            ref = genome_ref,
            read1 = cfdna_wgs_fastqs + "/{library}_processed_R1.fastq.gz",
            read2 = cfdna_wgs_fastqs + "/{library}_processed_R2.fastq.gz",
        log: logdir + "/{library}_cfdna_wgs_align.log",
        output:
            sort = cfdna_wgs_bams + "/{library}_raw.bam",
            index = cfdna_wgs_bams + "/{library}_raw.bam.bai",
        params:
            script = cfdna_wgs_scriptdir + "/align.sh",
            threads = cfdna_wgs_threads,
        resources:
            mem_mb = 500,
        shell:
            """
            {params.script} \
            {input.ref} \
            {input.read1} \
            {input.read2} \
            {params.threads} \
            {output.sort} &> {log}
            """
        
  • Shell script
    #!/usr/bin/env bash
    set -o errexit   # abort on nonzero exitstatus
    set -o nounset   # abort on unbound variable
    set -o pipefail  # don't hide errors within pipes (e.g. a failing bwa mem)

    # Align a read pair with bwa mem, then coordinate-sort and index the result.
    # Arguments:
    #   1  bwa index prefix
    #   2  read-1 fastq
    #   3  read-2 fastq
    #   4  thread count
    #   5  sorted BAM to write (its index is written alongside it)
    input_ref="${1}"
    input_r1="${2}"
    input_r2="${3}"
    threads="${4}"
    output_sort="${5}"

    bwa mem -M -t "$threads" \
        "$input_ref" \
        "$input_r1" \
        "$input_r2" |
        samtools view -@ "$threads" -Sb - -o - |
        samtools sort -@ "$threads" - -o "$output_sort"
    # -@ takes the numeric thread count, so the variable must be expanded here.
    samtools index -@ "$threads" "$output_sort"
        

Remove PCR duplicates

  • Snakemake
    # Remove PCR duplicates from aligned reads
    #  - Takes the raw sorted BAM from cfdna_wgs_align and writes {library}_dedup.bam.
    #  - dedup.sh receives input BAM, output BAM and thread count, in that order;
    #    its stdout/stderr are captured in the rule log.
    rule cfdna_wgs_dedup:
        benchmark: benchdir + "/{library}_cfdna_wgs_dedup.benchmark.txt",
        container: cfdna_wgs_container,
        input: cfdna_wgs_bams + "/{library}_raw.bam",
        log: logdir + "/{library}_cfdna_wgs_dedup.log",
        output: cfdna_wgs_bams + "/{library}_dedup.bam",
        params:
            script = cfdna_wgs_scriptdir + "/dedup.sh",
            threads = cfdna_wgs_threads,
        shell:
            """
            {params.script} \
            {input} \
            {output} \
            {params.threads} &> {log}
            """
        
  • Shell script
    #!/usr/bin/env bash
    set -o errexit   # abort on nonzero exitstatus
    set -o nounset   # abort on unbound variable
    set -o pipefail  # don't hide errors within pipes

    # Positional arguments: input BAM, de-duplicated BAM to write, thread count
    in_bam="${1}"
    out_bam="${2}"
    n_threads="${3}"

    # Name-sort, run fixmate, coordinate-sort, then drop duplicates with
    # markdup -r; everything is streamed through pipes until the final BAM.
    samtools sort -n -@ $n_threads -o - $in_bam \
        | samtools fixmate -m - - \
        | samtools sort -@ $n_threads -o - - \
        | samtools markdup -r -@ $n_threads - $out_bam

    # Index the de-duplicated BAM
    samtools index $out_bam
        

Filter de-duplicated alignments

  • Snakemake
    # Filter de-duplicated alignments.
    # Remove unmapped, not primary, and duplicate reads, and apply the mapping
    # quality threshold set in filter_alignment.sh.
    # NOTE(review): this comment previously also described a location filter
    # driven by a config bedfile variable; no bed file is passed to the script
    # here — confirm whether that filter is still intended.
    # Declared as a checkpoint rather than a plain rule.
    
    checkpoint cfdna_wgs_filter_alignment:
        benchmark: benchdir + "/{library}_cfdna_wgs_filter_alignment.benchmark.txt",
        container: cfdna_wgs_container,
        input: cfdna_wgs_bams + "/{library}_dedup.bam",
        log: logdir + "/{library}_cfdna_wgs_filter_alignment.log",
        output: cfdna_wgs_bams + "/{library}_filt.bam",
        params:
            script = cfdna_wgs_scriptdir + "/filter_alignment.sh",
            threads = cfdna_wgs_threads,
        shell:
            """
            {params.script} \
            {input} \
            {params.threads} \
            {output} &> {log}
            """
        
  • Shell script
    #!/usr/bin/env bash
    set -o errexit   # abort on nonzero exitstatus
    set -o nounset   # abort on unbound variable
    set -o pipefail  # don't hide errors within pipes

    # Filter a de-duplicated BAM and index the result.
    # Arguments:
    #   1  input BAM
    #   2  thread count
    #   3  filtered BAM to write
    input="${1}"
    threads="${2}"
    output="${3}"

    # Filter to reads that are
    #  - Excluding any unmapped, not primary alignment, or duplicates
    #    (-F 1284 = 4 + 256 + 1024)
    #  - Only MAPQ >= 20 (-q 20 skips alignments with MAPQ below 20)
    # DO NOT restrict to "proper pairs"- this clips long cfDNA fragments!

    samtools view -@ "$threads" -b -F 1284 -h -q 20 -o "$output" "$input"

    samtools index "${output}"
        

Read and alignment QC

FastQC

  • Snakemake
    # Get read quality by FASTQC
    #  - One job per {library}_{processing}_{read} fastq in the fastq directory.
    #  - fastqc.sh is given only the input fastq, the QC directory and the
    #    thread count; the declared html and zip outputs are not passed to it
    #    (presumably FastQC names them after the input file — confirm).
    rule cfdna_wgs_fastqc:
        benchmark: benchdir+ "/{library}_{processing}_{read}_cfdna_wgs_fastqc.benchmark.txt",
        container: cfdna_wgs_container,
        input: cfdna_wgs_fastqs + "/{library}_{processing}_{read}.fastq.gz",
        log: logdir + "/{library}_{processing}_{read}_cfdna_wgs_fastqc.log",
        output:
            qcdir + "/{library}_{processing}_{read}_fastqc.html",
            qcdir + "/{library}_{processing}_{read}_fastqc.zip",
        params:
            outdir = qcdir,
            script = cfdna_wgs_scriptdir + "/fastqc.sh",
            threads = cfdna_wgs_threads,
        shell:
            """
            {params.script} \
            {input} \
            {params.outdir} \
            {params.threads} &> {log}
            """
        
  • Shell script
    #!/usr/bin/env bash
    set -o errexit   # abort on nonzero exitstatus
    set -o nounset   # abort on unbound variable
    set -o pipefail  # don't hide errors within pipes

    # Positional arguments: fastq to check, report directory, thread count
    fq="${1}"
    report_dir="${2}"
    n_threads="${3}"

    # Run FastQC quietly, writing its reports into the report directory
    fastqc --quiet --threads $n_threads --outdir $report_dir $fq
        

Alignment QC

  • Snakemake
    # Get alignment QC using samtools
    #  - Runs for any {library}_{processing}.bam in the BAM directory and writes
    #    one flagstat and one samtools stats report per BAM.
    #  - Both log paths are passed to alignment_qc.sh, which writes them itself,
    #    so the shell command has no redirect.
    #  - No benchmark file is declared for this rule.
    rule cfdna_wgs_alignment_qc:
        container: cfdna_wgs_container,
        input: cfdna_wgs_bams + "/{library}_{processing}.bam",
        log:
            flagstat = logdir + "/{library}_{processing}_flagstat_cfdna_wgs_alignment_qc.log",
            samstat = logdir + "/{library}_{processing}_samstats_cfdna_wgs_alignment_qc.log",
        output:
            flagstat = qcdir + "/{library}_{processing}_flagstat.txt",
            samstat = qcdir + "/{library}_{processing}_samstats.txt",
        params:
            script = cfdna_wgs_scriptdir + "/alignment_qc.sh",
            threads = cfdna_wgs_threads,
        shell:
            """
            {params.script} \
            {input} \
            {log.flagstat} \
            {log.samstat} \
            {output.flagstat} \
            {output.samstat} \
            {params.threads}
            """
        
  • Shell script
    #!/usr/bin/env bash
    set -o errexit   # abort on nonzero exitstatus
    set -o nounset   # abort on unbound variable
    set -o pipefail  # don't hide errors within pipes

    # Positional arguments
    #   1  BAM to summarise
    #   2  log for samtools flagstat     3  log for samtools stats
    #   4  flagstat report               5  stats report
    #   6  thread count
    input="${1}"
    log_flagstat="${2}"
    log_samstat="${3}"
    output_flagstat="${4}"
    output_samstat="${5}"
    threads="${6}"

    # Run one samtools reporting subcommand on a BAM.
    #   1 subcommand, 2 BAM, 3 report file (stdout), 4 log file (stderr), 5 threads
    run_samtools_report(){
        local subcmd="${1}"
        local bam="${2}"
        local report="${3}"
        local errlog="${4}"
        local n_threads="${5}"
        #
        samtools $subcmd -@ $n_threads $bam > $report 2>$errlog
    }

    # Produce the flagstat report first, then the full stats report
    main(){
        run_samtools_report flagstat $input $output_flagstat $log_flagstat $threads
        run_samtools_report stats $input $output_samstat $log_samstat $threads
    }

    # Run
    main "$@"
        

Sequencing depth metrics via Picard

  • Snakemake
    # Sequencing depth metrics via Picard
    #  - Runs on the filtered BAM and writes {library}_picard_depth.txt.
    #  - The Picard jar and reference fasta are read directly from the config.
    #  - The script's stdout/stderr go to the declared log file, as in the other
    #    script-driven rules.
    rule cfdna_wgs_picard_depth:
        benchmark: benchdir + "/{library}_cfdna_wgs_picard_depth.benchmark.txt",
        container: cfdna_wgs_container,
        input: cfdna_wgs_bams + "/{library}_filt.bam",
        log: logdir + "/{library}_cfdna_wgs_picard_depth.log",
        output: qcdir + "/{library}_picard_depth.txt",
        params:
            script = cfdna_wgs_scriptdir + "/picard_depth.sh",
            threads = cfdna_wgs_threads,
        shell:
            """
            {params.script} \
            {input} \
            {config[picard_jar]} \
            {config[genome_fasta]} \
            {output} &> {log}
            """
        
  • Shell script
    #!/usr/bin/env bash
    # Collect whole-genome depth metrics for one BAM with Picard CollectWgsMetrics.
    # Positional arguments: input BAM, Picard jar, reference fasta, metrics file
    bam=$1
    jar=$2
    ref_fasta=$3
    metrics=$4

    # READ_LENGTH is fixed at 150
    java -jar $jar CollectWgsMetrics \
           INPUT=$bam \
           OUTPUT=$metrics \
           READ_LENGTH=150 \
           REFERENCE_SEQUENCE=$ref_fasta
        

deepTools fragment sizes

  • Snakemake
    # Get fragment sizes using deepTools
    #  - A single job over the filtered BAMs of every library.
    #  - "{input}" is quoted so the space-separated BAM list reaches the script
    #    as one argument.
    #  - The log path is passed to the script as an argument; the shell command
    #    itself has no redirect.
    rule cfdna_wgs_bampefragsize:
        benchmark: benchdir + "/cfdna_wgs_bampefragsize.benchmark.txt",
        container: cfdna_wgs_container,
        input: expand(cfdna_wgs_bams + "/{library}_filt.bam", library = CFDNA_WGS_LIBRARIES),
        log: logdir + "/cfdna_wgs_bampefragsize.log",
        output:
            raw = qcdir + "/deeptools_frag_lengths.txt",
            hist = qcdir + "/deeptools_frag_lengths.png",
        params:
            blacklist = config["blklist"],
            script = cfdna_wgs_scriptdir + "/bampefragsize.sh",
            threads = cfdna_wgs_threads,
        shell:
            """
            {params.script} \
            "{input}" \
            {log} \
            {output.hist} \
            {output.raw} \
            {params.blacklist} \
            {params.threads}
            """
        
  • Shell script
    #!/usr/bin/env bash
    set -o errexit   # abort on nonzero exitstatus
    set -o nounset   # abort on unbound variable
    set -o pipefail  # don't hide errors within pipes

    # Summarise paired-end fragment sizes across BAMs with deepTools.
    # Arguments:
    #   1  space-separated list of BAM files, passed as a single argument
    #   2  log file that receives the tool's stdout and stderr
    #   3  histogram image to write
    #   4  raw fragment-length table to write
    #   5  blacklist bed
    #   6  number of processors

    # Script variables

    input="${1}"
    log="${2}"
    output_hist="${3}"
    output_raw="${4}"
    blacklist="${5}"
    threads="${6}"

    # $input is deliberately left unquoted so the list splits into one
    # --bamfiles value per BAM.
    bamPEFragmentSize --bamfiles $input \
                      --numberOfProcessors "$threads" \
                      --blackListFileName "$blacklist" \
                      --histogram "$output_hist" \
                      --maxFragmentLength 1000 \
                      --outRawFragmentLengths "$output_raw" &> "$log"
        

deepTools bamCoverage

  • Snakemake
    # Make deeptools bamCoverage bedfile
    #  - One job per library, run on the filtered BAM, writing a bedgraph (.bg).
    #  - The bin size is fixed at 10000 here and handed to bamcoverage.sh together
    #    with the configured blacklist and thread count.
    rule cfdna_wgs_bamcoverage:
        benchmark: benchdir + "/{library}_cfdna_wgs_bamcoverage.benchmark.txt",
        container: cfdna_wgs_container,
        input: cfdna_wgs_bams + "/{library}_filt.bam",
        log: logdir + "/{library}_cfdna_wgs_bamcoverage.log",
        output: qcdir + "/{library}_bamcoverage.bg",
        params:
            bin = "10000",
            blacklist = config["blklist"],
            script = cfdna_wgs_scriptdir + "/bamcoverage.sh",
            threads = cfdna_wgs_threads,
        shell:
            """
            {params.script} \
            {input} \
            {output} \
            {params.bin} \
            {params.blacklist} \
            {params.threads} &> {log}
            """
        
  • Shell script
    #!/usr/bin/env bash

    # Positional arguments, in call order:
    #   input BAM, output bedgraph, bin size, blacklist bed, processor count
    bam=$1
    bedgraph=$2
    bin_size=$3
    blacklist_bed=$4
    n_proc=$5

    # RPGC-normalised coverage; the effective genome size is fixed below and
    # chrX is left out of the normalisation.
    bamCoverage \
        --bam $bam \
        --outFileName $bedgraph \
        --outFileFormat bedgraph \
        --binSize $bin_size \
        --blackListFileName $blacklist_bed \
        --numberOfProcessors $n_proc \
        --normalizeUsing RPGC \
        --effectiveGenomeSize 2913022398 \
        --ignoreForNormalization chrX \
        --extendReads \
        --ignoreDuplicates
        

deepTools plotCoverage

  • Snakemake
    # Make deepTools plotCoverage coverage maps for all filtered bams
    #  - A single job over the filtered BAMs of every library; "{input}" is quoted
    #    so the whole BAM list is passed to the script as one argument.
    #  - Writes a raw counts table (.tsv) and a coverage plot (.pdf).
    rule cfdna_wgs_plotcoverage:
        benchmark: benchdir + "/cfdna_wgs_plotcoverage.benchmark.txt",
        container: cfdna_wgs_container,
        input: expand(cfdna_wgs_bams + "/{library}_filt.bam", library = CFDNA_WGS_LIBRARIES),
        log: logdir + "/cfdna_wgs_plotcoverage.log",
        output:
            raw = qcdir + "/cfdna_wgs_coverage.tsv",
            plot = qcdir + "/cfdna_wgs_coverage.pdf",
        params:
            blacklist = config["blklist"],
            script = cfdna_wgs_scriptdir + "/plotcoverage.sh",
            threads = cfdna_wgs_threads,
        shell:
            """
            {params.script} \
            "{input}" \
            {params.blacklist} \
            {params.threads} \
            {output.raw} \
            {output.plot} &> {log}
            """
        
  • Shell script
    #!/usr/bin/env bash
    # Positional arguments:
    #   1  space-separated BAM list (one argument; split on whitespace below)
    #   2  blacklist bed      3  processor count
    #   4  raw counts table   5  coverage plot (pdf)
    bam_list=$1
    blacklist_bed=$2
    n_proc=$3
    counts_tsv=$4
    plot_pdf=$5

    plotCoverage \
        --bamfiles $bam_list \
        --blackListFileName $blacklist_bed \
        --numberOfProcessors $n_proc \
        --outRawCounts $counts_tsv \
        --plotFile $plot_pdf \
        --plotFileFormat pdf \
        --extendReads \
        --skipZeros
        

MultiQC

  • Snakemake
    # Aggregate QC files using MultiQC
    #  - Inputs: fastp json, FastQC zips (raw/processed/unpaired, R1/R2), samtools
    #    stats and flagstat (raw/filt), Picard depth, and the two deepTools tables.
    #  - "{input}" is quoted so the full file list reaches multiqc.sh as one argument.
    #  - Outputs are the html report and four per-tool tables inside
    #    <out_name>_data; params.threads is declared but not passed to the script.
    rule cfdna_wgs_multiqc:
        benchmark: benchdir + "/cfdna_wgs_multiqc.benchmark.txt",
        container: cfdna_wgs_container,
        input:
            expand(logdir + "/{library}_cfdna_wgs_fastp.json", library = CFDNA_WGS_LIBRARIES),
            expand(qcdir + "/{library}_{processing}_{read}_fastqc.zip", library = CFDNA_WGS_LIBRARIES, processing = ["raw", "processed", "unpaired"], read = ["R1","R2"]),
            expand(qcdir + "/{library}_{processing}_samstats.txt", library = CFDNA_WGS_LIBRARIES, processing = ["raw","filt"]),
            expand(qcdir + "/{library}_{processing}_flagstat.txt", library = CFDNA_WGS_LIBRARIES, processing = ["raw","filt"]),
            expand(qcdir + "/{library}_picard_depth.txt", library = CFDNA_WGS_LIBRARIES),
            qcdir + "/deeptools_frag_lengths.txt",
            qcdir + "/cfdna_wgs_coverage.tsv",
        log: logdir + "/cfdna_wgs_multiqc.log",
        output:
            qcdir + "/cfdna_wgs_multiqc.html",
            qcdir + "/cfdna_wgs_multiqc_data/multiqc_fastqc.txt",
            qcdir + "/cfdna_wgs_multiqc_data/multiqc_samtools_stats.txt",
            qcdir + "/cfdna_wgs_multiqc_data/multiqc_picard_wgsmetrics.txt",
            qcdir + "/cfdna_wgs_multiqc_data/multiqc_samtools_flagstat.txt",
        params:
            out_dir = qcdir,
            out_name = "cfdna_wgs_multiqc",
            script = cfdna_wgs_scriptdir + "/multiqc.sh",
            threads = cfdna_wgs_threads,
        shell:
            """
            {params.script} \
            "{input}" \
            {params.out_name} \
            {params.out_dir} &> {log}
            """
        
  • Shell script
    #!/usr/bin/env bash
    set -o errexit   # abort on nonzero exitstatus
    set -o nounset   # abort on unbound variable
    set -o pipefail  # don't hide errors within pipes

    # Positional arguments:
    #   1  space-separated list of QC files (one argument; split on whitespace below)
    #   2  report file name
    #   3  report directory
    qc_files="${1}"
    report_name="${2}"
    report_dir="${3}"

    # --force overwrites any report left by a previous run
    multiqc --force \
            --filename $report_name \
            --outdir $report_dir \
            $qc_files
        

Make aggregate QC table

  • Snakemake
    # Make a tab-separated aggregate QC table
    #  - Declared as a checkpoint rather than a plain rule.
    #  - Consumes the four MultiQC tables plus the deepTools fragment-length and
    #    coverage tables, and writes the read-QC and fragment-length TSVs
    #    requested by rule all.
    #  - Arguments are positional: their order must match make_qc_tsv.R.
    checkpoint cfdna_wgs_make_qc_tsv:
        benchmark: benchdir + "/cfdna_wgs_make_qc_tsv.benchmark.txt",
        container: cfdna_wgs_container,
        input:
            fq = qcdir + "/cfdna_wgs_multiqc_data/multiqc_fastqc.txt",
            mqsam = qcdir + "/cfdna_wgs_multiqc_data/multiqc_samtools_stats.txt",
            mqflag = qcdir + "/cfdna_wgs_multiqc_data/multiqc_samtools_flagstat.txt",
            picard = qcdir + "/cfdna_wgs_multiqc_data/multiqc_picard_wgsmetrics.txt",
            deeptools_frag = qcdir + "/deeptools_frag_lengths.txt",
            deeptools_cov = qcdir + "/cfdna_wgs_coverage.tsv",
        log: logdir + "/cfdna_wgs_make_qc_tsv.log",
        output:
            readqc = qcdir + "/cfdna_wgs_read_qc.tsv",
            fraglen = qcdir + "/cfdna_wgs_frag_len.tsv",
        params:
            script = cfdna_wgs_scriptdir + "/make_qc_tsv.R",
        shell:
            """
            Rscript {params.script} \
            {input.fq} \
            {input.mqsam} \
            {input.mqflag} \
            {input.picard} \
            {input.deeptools_frag} \
            {input.deeptools_cov} \
            {output.readqc} \
            {output.fraglen} >& {log}
            """
        
  • Rscript
    #!/usr/bin/env Rscript
    #
    # Unit test variables
    ## mqc_dir="test/analysis/qc/cfdna_wgs_multiqc_data"
    ## fastqc_input = paste0(mqc_dir,"/multiqc_fastqc.txt")
    ## samstats_input = paste0(mqc_dir, "/multiqc_samtools_stats.txt")
    ## flagstats_input = paste0(mqc_dir, "/multiqc_samtools_flagstat.txt")
    ## picard_input = paste0(mqc_dir, "/multiqc_picard_wgsmetrics.txt")
    ## deeptools_frag_input = "test/analysis/qc/deeptools_frag_lengths.txt"
    ## deeptools_cov_input = "test/analysis/qc/cfdna_wgs_coverage.tsv"
    
    # Positional command-line arguments, in the order the Snakemake rule passes
    # them: six input tables followed by the two output tables.
    args = commandArgs(trailingOnly = TRUE)
    fastqc_input = args[1]          # MultiQC FastQC table
    samstats_input = args[2]        # MultiQC samtools stats table
    flagstats_input = args[3]       # MultiQC samtools flagstat table
    picard_input = args[4]          # MultiQC Picard WGS metrics table
    deeptools_frag_input = args[5]  # deepTools raw fragment-length table
    deeptools_cov_input = args[6]   # deepTools plotCoverage raw counts table
    readqc_out_tbl = args[7]        # output: aggregate read-QC table
    frag_len_out_tbl = args[8]      # output: fragment-length table
    
    library(tidyverse)
    
    # Reshape the MultiQC FastQC table to one row per library.
    #   multiqc_fastqc_input: path to the tab-separated MultiQC FastQC table
    # Returns a tibble keyed by `library`, with every FastQC metric spread into
    # one column per read (read1/read2) and fastq processing stage.
    # Assumes library ids are the first six characters of Filename/Sample
    # (e.g. "lib001") — TODO confirm against the sample sheet.
    process_multiqc_fastqc = function(multiqc_fastqc_input){
      as_tibble(read.table(multiqc_fastqc_input, header = TRUE, sep = '\t', stringsAsFactors = FALSE)) %>%
      mutate(library = substr(Filename,1,6)) %>%
      mutate(read = ifelse(grepl("R1", Filename), "read1", "read2")) %>%
      # Processing stage = text after "libNNN_" up to the next underscore.
      # nchar() gives each string's own length; length() would give the row
      # count and cut the stage name short (or empty) on small tables.
      mutate(fastq_processing = gsub("_.*$","",substr(Sample, 8, nchar(Sample)))) %>%
      select(!c(Sample,File.type,Encoding)) %>%
      pivot_wider(
        names_from = c(read,fastq_processing),
        values_from = !c(library,read,fastq_processing))
    }
    
    # One row per library of FastQC metrics. The table is read and reshaped
    # exactly once, inside process_multiqc_fastqc; no second, unassigned pass
    # over the same input is made.
    fastqc = process_multiqc_fastqc(fastqc_input)
    
    # Reshape a MultiQC samtools table (stats or flagstat) to one row per library.
    #   multiqc_samfile: path to the tab-separated MultiQC table
    # Returns a tibble keyed by `library`, with each metric spread into one
    # column per BAM processing stage.
    process_multiqc_samfile = function(multiqc_samfile){
      sam_tbl = read_tsv(multiqc_samfile)
      # library = first six characters of Sample; stage = text after "libNNN_"
      # up to the next underscore
      sam_tbl = mutate(sam_tbl,
                       library = substr(Sample, 1, 6),
                       bam_processing = gsub("_.*$","",gsub("lib..._","", Sample)))
      sam_tbl = select(sam_tbl, !c(Sample))
      pivot_wider(sam_tbl,
                  names_from = c(bam_processing),
                  values_from = !c(library, bam_processing))
    }
    
    # Wide per-library tables from the two samtools MultiQC inputs
    samstats = process_multiqc_samfile(samstats_input)
    flagstats = process_multiqc_samfile(flagstats_input)
    
    # Fragment-length histogram: first line of the file is skipped and the
    # columns are named explicitly. Lengths of 500 and above are dropped, then
    # each length becomes its own "frag_len<N>" column holding the count.
    deeptools_frag = read_tsv(deeptools_frag_input, col_names = c("frag_len","frag_count","file"), skip = 1) %>%
      filter(frag_len < 500) %>%
      # Library id: six characters starting at the last "lib" in the file path
      mutate(library = substr(gsub("^.*lib", "lib", file), 1,6)) %>%
      mutate(frag_len = sub("^", "frag_len", frag_len)) %>%
      select(library, frag_len, frag_count) %>%
      pivot_wider(
        names_from = frag_len,
        values_from = frag_count)
    
    # Picard metrics: the Sample column is used as the library id unchanged
    picard = as_tibble(read.table(picard_input, header = TRUE, sep = '\t', stringsAsFactors = FALSE)) %>%
      mutate(library = Sample)
    
    # Per-bin coverage table: one column per file after the three position
    # columns. Reshaped to long form and summarised to mean/median per library.
    deeptools_cov = read_tsv(deeptools_cov_input, skip = 1) %>%
      pivot_longer(!c(`#'chr'`, `'start'`,`'end'`), names_to = "file", values_to = "cnt") %>%
      rename(chr = `#'chr'`,
             start = `'start'`,
             end = `'end'`) %>%
      # Characters 2-7 of the column name; presumably skips a leading quote
      # like the one on the position column names - TODO confirm
      mutate(library = substr(file, 2, 7)) %>%
      group_by(library) %>%
      summarise(
        mean_cov = mean(cnt),
        median_cov = median(cnt),
                )
    
    # Combine all QC sources; libraries missing from a source get NA there
    readqc = fastqc %>%
      left_join(samstats, by = "library") %>%
      left_join(flagstats, by = "library") %>%
      left_join(deeptools_frag, by = "library") %>%
      left_join(picard, by = "library") %>%
      left_join(deeptools_cov, by = "library")
    
    write.table(readqc, file = readqc_out_tbl, row.names = F, sep = '\t', quote = F)
    
    # Full grid of fragment lengths so that every length has a row in the output
    all_frag_len = data.frame(frag_len = 1:500)
    
    # Fragment-length table: one row per length, one column per library,
    # with lengths absent from the data filled with 0
    frag_len =
      readqc %>% select(starts_with("frag_len") | matches("library")) %>%
      pivot_longer(!library, names_to = "frag_len", values_to = "count") %>%
      mutate(frag_len = as.numeric(gsub("frag_len","",frag_len))) %>%
      mutate(count = as.numeric(count)) %>%
      pivot_wider(names_from = library, values_from = count) %>%
      right_join(all_frag_len) %>% arrange(frag_len) %>%
      replace(is.na(.), 0)
    
    write_tsv(frag_len, file = frag_len_out_tbl)
    
        

Downsample bams

  • Snakemake
    # Downsample a filtered bam to the read depth given by the {downsample}
    # wildcard (passed to the script as millions of reads). Only a touch()ed
    # flag file is declared as output; the downsampled bam itself is written
    # by the script into params.out_dir.
    rule downsample_bams:
        input: cfdna_wgs_bams + "/{library}_filt.bam",
        output: touch(logdir + "/{library}_{downsample}_downsample.done"),
        container: cfdna_wgs_container,
        params:
            script = cfdna_wgs_scriptdir + "/downsample_bams.sh",
            out_dir = cfdna_wgs_bams,
            suffix = "_filt.bam",
            threads = cfdna_wgs_threads,
        shell:
            """
            {params.script} \
            {input} \
            {wildcards.downsample} \
            {params.out_dir} \
            {params.suffix} \
            {params.threads}
            """
        
  • Shell script
    #!/usr/bin/env bash
    # Downsample a bam file to a target number of reads.
    #
    # Positional arguments:
    #   1  in_bam    input bam
    #   2  milreads  target depth in millions of reads
    #   3  outdir    directory for the downsampled bam
    #   4  suffix    suffix stripped from the input name to get the library base name
    #   5  threads   threads for samtools view
    #
    # Output: ${outdir}/<base>_ds${milreads}.bam, written ONLY when the input
    # holds more reads than the target (sampling fraction < 1).
    #
    # For unit testing
    # in_bam=test/analysis/cfdna_wgs/bams/lib001_filt.bam
    # milreads=0.001
    # checker=test/tmp/lib001_ds0.001.txt
    # outdir=test/analysis/cfdna_wgs/bams
    # suffix=_filt.bam
    # threads=4
    
    in_bam="${1}"
    milreads="${2}"
    outdir="${3}"
    suffix="${4}"
    threads="${5}"
    
    downsample(){
        local milreads_full factor base out_bam
        # Derived variables
        milreads_full=$(awk -v milreads="${milreads}" 'BEGIN{milreads_full=(1000000*milreads); print milreads_full}')
        # Sampling fraction = target reads / mapped reads (idxstats column 3).
        # Fixed-point output keeps tiny fractions out of scientific notation;
        # nothing is printed when the bam has no mapped reads.
        factor=$(samtools idxstats "$in_bam" |
                     cut -f3 |
                     awk -v count="$milreads_full" 'BEGIN {total=0} {total += $1} END {if (total > 0) printf "%.10f", count/total}')
        base=$(basename -s "$suffix" "$in_bam")
        out_bam="${outdir}/${base}_ds${milreads}.bam"
        #
        if [[ -z "$factor" ]]; then
            echo "No mapped reads counted in ${in_bam}; skipping downsampling" >&2
            return 0
        fi
        #
        # Downsample
        # Compare numerically in awk: [[ a < b ]] compares strings, not numbers
        if awk -v f="$factor" 'BEGIN {exit !(f < 1)}'; then
            samtools view -s "$factor" -b -@ "$threads" "$in_bam" > "$out_bam"
        fi
    }
    
    downsample
        

Setup conditional execution of downsampled bams

# If downsampling occurred, write the downsampled bam path into this per-library log; otherwise leave the log file empty
# Record whether rule downsample_bams actually produced a bam for this
# library/depth pair. Input is the flag file of that rule; output holds the
# path {bamdir}/{library}_ds{downsample}.bam when that file exists and is
# created empty otherwise.
rule log_dowsample:
    input: logdir + "/{library}_{downsample}_downsample.done",
    output: logdir + "/{library}_{downsample}_made",
    params:
        bamdir = cfdna_wgs_bams,
    shell:
        """
        dspath={params.bamdir}/{wildcards.library}_ds{wildcards.downsample}.bam
        if [ -f $dspath ]; then echo "$dspath"  > {output}; else touch {output}; fi
        """

# Use the downsampled bam logs to make a single text file of conditionally executed final targets.
# Specifically in this example, log text lines are in the form
# cfdna_wgs_bams + "/{library}_ds{downsample}_frag90_150.bam" to setup conditional execution of fragment filtering ONLY on downsampled bams
# Note alternative delimiter "~" to sed allows cfdna_wgs_wigs as param

# Concatenate all per-library downsample logs into one list of final targets.
# Each logged bam path is re-rooted at params.outdir and its ".bam" ending is
# replaced by "_frag{frag_distro}.bam". Declared as a checkpoint so that the
# list can be read back after it has been written.
checkpoint ds_cond_target_list:
    input: expand(logdir + "/{library}_{downsample}_made", library = CFDNA_WGS_LIBRARIES, downsample = DOWNSAMPLE),
    output: logdir + "/ds_final_targets",
    params:
        outdir = cfdna_wgs_bams,
        frag_distro=config["frag_distro"]
    shell:
        """
        if [ -f {output} ]; then rm {output}; fi
        cat {input} > {output}
        sed -i 's~^.*lib~{params.outdir}/lib~g' {output}
        sed -i 's/[.]bam$/_frag{params.frag_distro}.bam/g' {output}
        """

# Function just pulls the final target names out of ds_final_targets
def get_ds_targets(wildcards):
    """Return the target paths listed in the ds_cond_target_list checkpoint output.

    Calling ``checkpoints.ds_cond_target_list.get`` makes this input function
    depend on the checkpoint, so the list is read only once it exists.

    Args:
        wildcards: wildcards of the requesting rule, forwarded to the checkpoint.

    Returns:
        list[str]: one stripped path per non-blank line of the target list.
    """
    target_list = checkpoints.ds_cond_target_list.get(**wildcards).output[0]
    with open(target_list, "r") as f:
        # Blank lines are skipped: an empty string is not a usable input path
        non_empty_files = [line.strip() for line in f if line.strip()]
    return non_empty_files

# This rule allows execution of rules which will generate the conditional targets in ds_cond_target_list
# Requesting this rule's output forces the conditional targets returned by
# get_ds_targets to be built; the output file lists them, one per line.
rule make_ds_targets:
    output: logdir + "/aggregate_output"
    input:
        get_ds_targets
    run:
        target_listing = "\n".join(input)
        with open(output[0], "w") as handle:
            handle.write(target_listing)

Filter downsampled bams to set fragment length distributions

# Filter a downsampled bam to a fragment-length window.
# The {frag_distro} wildcard has the form <min>_<max>; both bounds are split
# out in the shell block and handed to the script. The `check` input ties this
# rule to the per-library downsample log.
rule frag_filt:
    container: cfdna_wgs_container,
    input:
        main = cfdna_wgs_bams + "/{library}_ds{downsample}.bam",
        check = logdir + "/{library}_{downsample}_made",
    output:
        nohead = temp(cfdna_wgs_bams + "/{library}_ds{downsample}_frag{frag_distro}.nohead"),
        onlyhead = temp(cfdna_wgs_bams + "/{library}_ds{downsample}_frag{frag_distro}.only"),
        final = cfdna_wgs_bams + "/{library}_ds{downsample}_frag{frag_distro}.bam",
    params:
        script = cfdna_wgs_scriptdir + "/frag_filt.sh",
        threads = cfdna_wgs_threads,
    shell:
        """
        frag_min=$(echo {wildcards.frag_distro} | sed -e "s/_.*$//g")
        frag_max=$(echo {wildcards.frag_distro} | sed -e "s/^.*_//g")
        {params.script} \
        {input.main} \
        {output.nohead} \
        $frag_min \
        $frag_max \
        {params.threads} \
        {output.onlyhead} \
        {output.final}
        """
  • Shell script
    #!/usr/bin/env bash
    # Keep only alignments whose absolute template length (TLEN, SAM column 9)
    # lies strictly between a lower and an upper bound.
    #
    # Positional arguments:
    #   1  input bam
    #   2  headerless SAM of kept alignments (intermediate)
    #   3  lower TLEN bound (exclusive)
    #   4  upper TLEN bound (exclusive)
    #   5  threads for sambamba
    #   6  header-only SAM (intermediate)
    #   7  output coordinate-sorted bam
    in_bam="$1"
    nohead="$2"
    frag_min="$3"
    frag_max="$4"
    threads="$5"
    onlyhead="$6"
    out_bam="$7"
    
    # Steps
    ## Filter by absolute value of TLEN for each read
    sambamba view -t "$threads" "$in_bam" |
        awk -F'\t' -v upper="$frag_max" 'sqrt($9*$9) < upper {print $0}' |
        awk -F'\t' -v lower="$frag_min" 'sqrt($9*$9) > lower {print $0}' > "$nohead"
    
    ## Restore header
    sambamba view -H "$in_bam" > "$onlyhead"
    
    ## Re-attach the header, convert to bam and sort, using the requested
    ## thread count instead of a hard-coded 4
    cat "$onlyhead" "$nohead" |
        sambamba view -t "$threads" -S -f bam /dev/stdin |
        sambamba sort -t "$threads" -o "$out_bam" /dev/stdin
    
    
        

Fragmentomics

Preamble

#########1#########2#########3#########4#########5#########6#########7#########8
#                                                                              #
#     Fragmentomic Analysis of Cell-free DNA Whole Genome Sequencing           #
#                                                                              #
#########1#########2#########3#########4#########5#########6#########7#########8

Make GC and mappability restricted bins

  • Snakemake
    # Build the bed file of genomic bins kept for fragment counting from the
    # configured bin table (gc5mb) and blacklist (blklist).
    rule make_gc_map_bind:
        input:
            gc5mb = config["gc5mb"],
            blklist = config["blklist"],
        output: refdir + "/keep_5mb.bed",
        log: logdir + "/make_gc_map_bind.log",
        container: cfdna_wgs_container,
        params:
            script = cfdna_wgs_scriptdir + "/make_gc_map_bind.sh",
        shell:
            """
            {params.script} \
            {input.gc5mb} \
            {input.blklist} \
            {output} &> {log}
            """
        
  • Shell script
    #!/usr/bin/env bash
    # Build the "keep" bin file.
    #
    # Positional arguments:
    #   1  gc5mb    bed file of bins (column 4 is compared against 0.3 below;
    #               presumably the bin GC fraction - TODO confirm)
    #   2  blklist  bed file of regions to exclude
    #   3  keep     output bed file
    gc5mb="${1}"
    blklist="${2}"
    keep="${3}"
    
    # -v keeps only bins with no overlap at all with the blacklist
    bedtools intersect -a "$gc5mb" -b "$blklist" -v -wa |
        # Drop every line containing an underscore
        grep -v _ |
        awk '{ if ($4 >= 0.3) print $0 }' > "$keep"
        

Make bedfile from filtered bam

  • error may be multimappers https://www.biostars.org/p/55149/
  • Snakemake
    # Make a fragment bed file from a filtered bam.
    # The script receives, in order: input bam, genome fasta, threads, output bed.
    rule filt_bam_to_frag_bed:
        benchmark: benchdir + "/{library}_filt_bam_to_frag_bed.benchmark.txt",
        container: cfdna_wgs_container,
        input: cfdna_wgs_bams + "/{library}_filt.bam",
        log: logdir + "/{library}_filt_bam_to_frag_bed.log",
        output: cfdna_wgs_frag_beds + "/{library}_filt.bed",
        params:
            fasta = genome_fasta,
            script = cfdna_wgs_scriptdir + "/filt_bam_to_frag_bed.sh",
            threads = cfdna_wgs_threads,
        shell:
            """
            {params.script} \
            {input} \
            {params.fasta} \
            {params.threads} \
            {output}
            """
        
  • Shell script
    #!/usr/bin/env bash
    # Convert a bam into a 5-column fragment bed:
    # chrom, start, end, GC fraction, fragment length.
    
    # Snakemake variables
    input_bam="$1"
    params_fasta="$2"
    threads="${3}"
    output_frag_bed="$4"
    
    # Function
    # Arguments: 1 = bam, 2 = genome fasta, 3 = output bed
    bam_to_frag(){
        # Name-sort and fix mates so that bamtobed sees proper pairs
        samtools sort -@ $threads -n -o - $1 |
        samtools fixmate -@ $threads -m -r - - |
        # One bedpe line per read pair
        bedtools bamtobed -bedpe -i - |
        # Keep pairs on one chromosome with mate-1 start before mate-2 end,
        # and collapse each pair to a single full-fragment interval
        awk -v OFS='\t' '$1 == $4 && $2 < $6 {print $1, $2, $6}' |
        # Annotate with nucleotide content
        bedtools nuc -fi $2 -bed stdin |
        # Skip the first (header) line; keep position plus columns 5 and 12
        awk -v OFS='\t' 'NR > 1 {print $1, $2, $3, $5, $12}' > $3
    }
    
    # Run command
    bam_to_frag $input_bam $params_fasta $output_frag_bed
    
        

Make GC distributions

  • Snakemake
    # Per-library GC distribution: fraction of fragments in each GC stratum
    rule gc_distro:
        input: cfdna_wgs_frag_beds + "/{library}_filt.bed",
        output: cfdna_wgs_frag_gc_distros + "/{library}_gc_distro.csv",
        log: logdir + "/{library}_cfdna_wgs_gc_distro.log",
        benchmark: benchdir + "/{library}_cfdna_wgs_gc_distro.benchmark.txt",
        container: cfdna_wgs_container,
        params:
            script = cfdna_wgs_scriptdir + "/gc_distro.R",
        shell:
            """
            Rscript {params.script} \
            {input} \
            {output} \
            > {log} 2>&1
            """
        
  • Rscript
    #!/usr/bin/env Rscript
    # Tabulate the fraction of fragments per GC stratum for one library.
    #
    # Arguments:
    #   1  bed_file     5-column fragment bed (chr, start, end, gc_raw, len), no header
    #   2  distro_file  output csv with columns library_id, gc_strata, fract_frags
    args = commandArgs(trailingOnly = TRUE)
    bed_file = args[1]
    distro_file = args[2]
    
    library(tidyverse)
    
    # Read in modified bed
    bed = read.table(bed_file, sep = '\t')
    names(bed) = c("chr","start","end","gc_raw","len")
    
    # Library id: file name from the last "lib" onwards, minus the bed suffix.
    # The gc_distro rule feeds "<library>_filt.bed"; "_frag.bed" is still
    # accepted so older file names keep working.
    library_id_val = sub("_(filt|frag)\\.bed$", "", gsub("^.*lib", "lib", bed_file))
    
    # Generate distribution csv
    distro =
      bed %>%
      # Round GC
      mutate(gc_strata = round(gc_raw, 2)) %>%
      # Count frags per strata
      count(gc_strata) %>%
      # Get fraction frags
      mutate(fract_frags = n/sum(n)) %>%
      mutate(library_id = library_id_val) %>%
      select(library_id,gc_strata,fract_frags) %>%
      write.csv(file = distro_file, row.names = F)
    
        

Make healthy GC distributions summary file

  • Snakemake
    # Summarise the GC distributions of all healthy libraries into one rds file.
    # The full input list is passed to the script as a single quoted argument.
    rule healthy_gc:
        input: expand(cfdna_wgs_frag_gc_distros + "/{library}_gc_distro.csv", library = CFDNA_WGS_HEALTHY_LIBRARIES),
        output: cfdna_wgs_frag_gc_distros + "/healthy_med.rds",
        log: logdir + "/cfdna_wgs_healthy_gc.log",
        benchmark: benchdir + "/cfdna_wgs_healthy_gc.benchmark.txt",
        container: cfdna_wgs_container,
        params:
            distro_dir = cfdna_wgs_frag_gc_distros,
            script = cfdna_wgs_scriptdir + "/healthy_gc.R",
        shell:
            """
            Rscript {params.script} \
            {params.distro_dir} \
            "{input}" \
            {output} > {log} 2>&1
            """
        
  • Rscript
    #!/usr/bin/env Rscript
    # Median fragment fraction per GC stratum across the healthy libraries.
    #
    # Arguments:
    #   1  distro_dir        directory of the GC distribution files (read, not used below)
    #   2  healthy_libs_str  space-separated paths of the per-library distribution csvs
    #   3  healthy_med_file  output rds
    args = commandArgs(trailingOnly = TRUE)
    distro_dir = args[1]
    healthy_libs_str = args[2]
    healthy_med_file = args[3]
    
    library(tidyverse)
    
    # One path per healthy library
    healthy_libs_distros = unlist(strsplit(healthy_libs_str, " "))
    
    # Read every csv and stack the rows
    healthy_list = lapply(healthy_libs_distros, function(gc_csv) read.csv(gc_csv, header = T))
    healthy_all = do.call(rbind, healthy_list)
    
    # Median fraction per stratum
    healthy_grouped = group_by(healthy_all, gc_strata)
    healthy_med = summarise(healthy_grouped, med_frag_fract = median(fract_frags))
    
    # Save
    saveRDS(healthy_med, file = healthy_med_file)
        

Sample fragments by healthy GC proportions

  • Snakemake
    # Resample each library's fragments with weights taken from the healthy
    # median GC distribution.
    rule cfdna_wgs_gc_sample:
        input:
            frag_bed = cfdna_wgs_frag_beds + "/{library}_filt.bed",
            healthy_med = cfdna_wgs_frag_gc_distros + "/healthy_med.rds",
        output: cfdna_wgs_frag_beds + "/{library}_sampled_frag.bed",
        log: logdir + "/{library}_cfdna_wgs_gc_sample.log",
        benchmark: benchdir + "/{library}_cfdna_wgs_gc_sample.benchmark.txt",
        container: cfdna_wgs_container,
        params:
            script = cfdna_wgs_scriptdir + "/gc_sample.R",
        shell:
            """
            Rscript {params.script} \
            {input.healthy_med} \
            {input.frag_bed} \
            {output} > {log} 2>&1
            """
        
  • Rscript
    #!/usr/bin/env Rscript
    # Resample a library's fragments, weighting each fragment by the healthy
    # median fraction of its GC stratum.
    #
    # Arguments:
    #   1  healthy_med   rds with columns gc_strata and med_frag_fract
    #   2  frag_file     5-column fragment bed (chr, start, end, gc_raw, len), no header
    #   3  sampled_file  output bed with columns chr, start, end, len, gc_strata (no header)
    args = commandArgs(trailingOnly = TRUE)
    healthy_med = args[1]
    frag_file = args[2]
    sampled_file = args[3]
    
    library(tidyverse)
    
    healthy_fract = readRDS(healthy_med)
    
    frag_bed = read.table(frag_file, sep = '\t', header = F)
    names(frag_bed) = c("chr", "start", "end", "gc_raw", "len")
    
    frag = frag_bed %>%
      # Round off the GC strata
      mutate(gc_strata = round(gc_raw, 2)) %>%
      # Join the median fraction of fragments per strata in healthies
      # Use this later as sampling weight
      left_join(healthy_fract, by = "gc_strata")
    
    # Sample, with replacement, as many fragments as remain after dropping
    # strata never seen in healthies. Sampling is unseeded, so the output
    # differs between runs.
    sampled = frag %>%
      filter(!is.na(med_frag_fract)) %>%
      slice_sample(., n = nrow(.), weight_by = med_frag_fract, replace = T) %>% select(chr, start, end, len, gc_strata)
    
    write.table(sampled, sep = "\t", col.names = F, row.names = F, quote = F, file = sampled_file)
        

Sum fragments in genomic windows by length

  • Snakemake
    # Split the GC-resampled fragments of one library into short and long
    # length groups (thresholds live in frag_window_sum.sh).
    # The script receives, in order: input bed, short output, long output.
    # params.threads is declared but not passed to the script.
    
    rule frag_sum:
        benchmark: benchdir + "/{library}_frag_sum.benchmark.txt",
        container: cfdna_wgs_container,
        input: cfdna_wgs_frag_beds + "/{library}_sampled_frag.bed",
        log: logdir + "/{library}_cfdna_wgs_frag_window_sum.log",
        output:
            short = cfdna_wgs_frag_beds + "/{library}_norm_short.bed",
            long =  cfdna_wgs_frag_beds + "/{library}_norm_long.bed",
        params:
            script = cfdna_wgs_scriptdir + "/frag_window_sum.sh",
            threads = cfdna_wgs_threads,
        shell:
            """
            {params.script} \
            {input} \
            {output.short} {output.long} &> {log}
            """
        
  • Shell script
    #!/usr/bin/env bash
    # Split a sampled fragment bed into short and long fragment files.
    #
    # The input is written by gc_sample.R with columns
    #   chr, start, end, len, gc_strata
    # so fragment length is column 4 (column 5 is the GC stratum and must not
    # be used as a length bound).
    #
    # Positional arguments:
    #   1  input fragment bed
    #   2  output bed of short fragments (100-150 bp, inclusive)
    #   3  output bed of long fragments (151-220 bp, inclusive)
    input_frag="$1"
    output_short="$2"
    output_long="$3"
    
    # Functions
    make_short(){
        awk '{if ($4 >= 100 && $4 <= 150) print $0}' "$1" > "$2"
    }
    
    make_long(){
        awk '{if ($4 >= 151 && $4 <= 220) print $0}' "$1" > "$2"
    }
    
    # Run command
    make_short "$input_frag" "$output_short"
    make_long "$input_frag" "$output_long"
    
        

Count fragments intersecting windows

  • Snakemake
    # Count short and long fragments intersecting kept genomic windows.
    # The script is run once per length class; output of both runs goes to the
    # declared log (second run appends).
    
    rule frag_window_count:
        benchmark: benchdir + "/{library}_cfdna_wgs_frag_window_int.benchmark.txt",
        container: cfdna_wgs_container,
        input:
            short = cfdna_wgs_frag_beds + "/{library}_norm_short.bed",
            long = cfdna_wgs_frag_beds + "/{library}_norm_long.bed",
            matbed = refdir + "/keep_5mb.bed",
        log: logdir + "/{library}_cfdna_wgs_frag_window_int.log",
        output:
            short = cfdna_wgs_frag_counts + "/{library}_cnt_short.tmp",
            long = cfdna_wgs_frag_counts + "/{library}_cnt_long.tmp",
        params:
            script = cfdna_wgs_scriptdir + "/frag_window_int.sh",
            threads = cfdna_wgs_threads,
        shell:
            """
            {params.script} \
            {input.short} \
            {input.matbed} \
            {output.short} &> {log}
            {params.script} \
            {input.long} \
            {input.matbed} \
            {output.long} &>> {log}
            """
        
  • Shell script
    #!/usr/bin/env bash
    # Count, for every window of the keep bed, how many fragments overlap it.
    # Arguments: 1 = fragment bed, 2 = keep (window) bed, 3 = output file
    frag_bed=$1
    window_bed=$2
    count_out=$3
    
    # -c appends the overlap count to each -a (window) record
    bedtools intersect -c -a $window_bed -b $frag_bed > $count_out
    
        

Merge counts across length and library

  • Snakemake
    # Merge short and long fragment counts by genomic position for all libraries.
    # The script reads every count file in params.counts_dir, which is the same
    # directory (cfdna_wgs_frag_counts) that the input files are written to.
    rule cfdna_wgs_count_merge:
        benchmark: benchdir + "/cfdna_wgs_count_merge.benchmark.txt",
        container: cfdna_wgs_container,
        input: expand(cfdna_wgs_frag_counts + "/{library}_cnt_{length}.tmp",  library = CFDNA_WGS_LIBRARIES, length = ["short","long"]),
        log: logdir + "/cfdna_wgs_count_merge.log",
        output:  cfdna_wgs_frag + "/frag_counts.tsv",
        params:
            counts_dir = cfdna_wgs_frag_counts,
            script = cfdna_wgs_scriptdir + "/count_merge.sh",
            threads = cfdna_wgs_threads,
        shell:
            """
            {params.script} \
            {params.counts_dir} \
            {output} &> {log}
            """
        
  • Shell script
    #!/usr/bin/env bash
    # Merge all per-library window count files into one tsv.
    #
    # Positional arguments:
    #   1  counts_dir  directory holding <library>_cnt_<len_class>.tmp files
    #   2  out_tsv     aggregate output with a header line
    #
    # For unit testing
    #counts_dir="/home/jeszyman/mpnst/analysis/cfdna-wgs/frag/counts"
    #out_tsv="/home/jeszyman/mpnst/analysis/cfdna-wgs/frag/frag_counts.tsv"
    
    # Define variables
    counts_dir="${1}"
    out_tsv="${2}"
    
    # Start the aggregate file with the header; this also replaces any
    # existing file and keeps the output valid when no count file is found
    printf 'library\tlen_class\tchr\tstart\tend\tgc\tcount\n' > "$out_tsv"
    
    # Make aggregate file
    for file in "${counts_dir}"/*_cnt_*.tmp;
    do
        # An unmatched glob is left as the literal pattern; skip it
        [ -e "$file" ] || continue
        # File names have the form <library>_cnt_<len_class>.tmp
        base=$(basename "$file" .tmp)
        library="${base%%_*}"
        len_class="${base##*_}"
        # Prefix every record with library id and length class. Labels come
        # from the file name only, so data fields are never rewritten.
        awk -v OFS='\t' -v lib="$library" -v cls="$len_class" 'NF {print lib, cls, $0}' "$file" >> "$out_tsv"
    done
    
        
    #!/usr/bin/env bash
    # Alternate merge script: aggregate an explicit list of count files.
    #
    # Positional arguments:
    #   1  output  aggregate file to (re)create
    #   2  array2  list of input files, assigned as given to a bash array
    #
    # NOTE(review): the header written here has no "gc" column; confirm it
    # matches the columns of the count files passed in.
    output=$1
    declare -a array2=$2
    
    if [ -f "$output" ]; then \rm "$output"; fi
    
    for file in ${array2[@]}; do
        # Prefix each line with the file name, then reduce it to
        # "<library>\t<len_class>"
        awk '{{print FILENAME (NF?"\t":"") $0}}' $file |
            sed 's/^.*lib/lib/g' |
            sed 's/_.*_/\t/g' |
            # Cleanup "tmp"
            sed 's/.tmp//g' |
            sed 's/\.bed//g' >> "$output"
    done
    
    # Add a header (to the file written above; $out_tsv is not defined here)
    sed -i  '1 i\library	len_class	chr	start	end	count' "$output"
    
        

Make a zero-centered, unit SD fragment file

  • Snakemake
    # Turn the merged fragment counts into per-library centered short/long ratios
    rule unit_cent_sd:
        input: cfdna_wgs_frag + "/frag_counts.tsv",
        output: cfdna_wgs_frag + "/ratios.tsv",
        log: logdir + "/unit_cent_sd.log",
        benchmark: benchdir + "/unit_cent_sd.benchmark.txt",
        container: cfdna_wgs_container,
        params:
            script = cfdna_wgs_scriptdir + "/make_ratios.R",
        shell:
            """
            Rscript {params.script} \
            {input} {output} > {log} 2>&1
            """
        
  • Rscript
    #!/usr/bin/env Rscript
    # Compute the short/long fragment ratio per library and genomic bin and
    # center it on zero within each library.
    #
    # Arguments:
    #   1  frags_tsv   aggregate counts (library, len_class, chr, start, end, ..., count)
    #   2  ratios_tsv  output tsv with library, chr, start, end, fract, ratio.centered
    
    # For unit testing
    # frags_tsv = "test/analysis/cfdna_wgs/frag/frag_counts.tsv"
    # ratios_tsv = "/home/jeszyman/mpnst/analysis/cfdna-wgs/frag/ratios.tsv"
    
    args = commandArgs(trailingOnly = TRUE)
    frags_tsv = args[1]
    ratios_tsv = args[2]
    
    # Load necessary packages
    library(tidyverse)
    
    # Load aggregate frag tsv
    frags = read_tsv(frags_tsv)
    
    # From per-position, per library short and long fragment counts, zero-centered fragment ratio
    # See https://github.com/cancer-genomics/reproduce_lucas_wflow/blob/master/analysis/fig2a.Rmd
    
    ratios =
      frags %>%
      mutate_at(vars(start, end, count), as.numeric) %>%
      # Put lib-bin short and long values on same row in order to make per-row ratios
      # (duplicate rows for one lib-bin-class are averaged)
      pivot_wider(names_from = len_class, values_from = count, values_fn = function(x) mean(x)) %>%
      mutate(fract = short/long) %>%
      select(library, chr, start, end, fract) %>%
      # Zero center by library (scale = F: values are centered, not divided by the SD)
      group_by(library) %>%
      mutate(ratio.centered = scale(fract, scale=F)[,1])
    
    write_tsv(ratios, file = ratios_tsv)
        

Reference

  • Based on cfDNA fragmentomics cite:mathios2021

Development

Ideas

Preamble

#########1#########2#########3#########4#########5#########6#########7#########8
#                                                                              #
#      Integration Testing Snakefile for Analysis of Cell-free DNA             #
#    Whole Genome Sequencing Copy Number Alteration and Fragmentomics          #
#                                                                              #
#########1#########2#########3#########4#########5#########6#########7#########8

# Load necessary packages for snakemake run
import pandas as pd
import re
import numpy as np

Variable naming

# Variable naming
# Values read from the Snakemake config
benchdir = config["benchdir"]
cfdna_wgs_repo = config["cfdna_wgs_repo"]
cfdna_wgs_scriptdir = config["cfdna_wgs_scriptdir"]
logdir = config["logdir"]
threads = config["threads"]

# Suggested directory structure:
analysis = config["datadir"] + "/analysis"
cfdna_wgs = config["datadir"]      + "/analysis/cfdna_wgs"
cfdna_wgs_cna = config["datadir"]  + "/analysis/cfdna_wgs/cna"
cfdna_wgs_frag = config["datadir"] + "/analysis/cfdna_wgs/frag"

# Terminal variable paths:
#  (These variables are used directly in the cna snakefile)
cfdna_wgs_cna_in_bams      = cfdna_wgs_cna + "/input_bams"
cfdna_wgs_cna_frag_bams    = cfdna_wgs_cna + "/frag_bams"
cfdna_wgs_cna_wigs         = cfdna_wgs_cna + "/wigs"
cfdna_wgs_cna_ichor_nopon  = cfdna_wgs_cna + "/ichor_nopon"

cfdna_wgs_frag_input_bams  = cfdna_wgs_cna + "/input_bams"
cfdna_wgs_frag_beds       = cfdna_wgs_frag + "/beds"

cfdna_wgs_frag_counts     = cfdna_wgs_frag + "/counts"

refdir                 = config["datadir"] + "/ref"

# Additional variable names used directly in the cna snakefile:
chrom_sizes = config["chrom_sizes"]
genome_fasta = "/mnt/ris/aadel/Active/mpnst/inputs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna"


#TMP_FRAG_LIBS = ["lib001_filt","lib002_filt"]

#chrs = "chr8"

# Plain strings: a trailing comma after the value would turn each of these
# into a one-element tuple
chrs = "chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chrX,chrY,chrM"

keep_bed = refdir + "/hg38_keep.bed"
blklist = config["blklist"]
genome_ref = config["genome_ref"]


FRAG_DISTROS = config["frag_distro"]

# Same config value as `threads`, under the name the workflow snakefiles use
cfdna_wgs_threads = config["threads"]


cfdna_wgs_container = config["cfdna_wgs_container"]
default_container = config["default_container"]

autosome_bed = refdir + "/hg38_autosomes.bed"
cfdna_wgs_fastqs = cfdna_wgs + "/fastqs"
cfdna_wgs_bams = cfdna_wgs + "/bams"
qc = config["datadir"] + "/qc"

# cfdna_wgs_container = config["cfdna_wgs_container"]


# cfdna_wgs_cna_bam_inputs   = config["dir"]["data"] + "/bam/filt"
# cfdna_wgs_cna_bam_fragfilt = config["dir"]["data"] + "/bam/frag"

# wig = config["dir"]["data"] + "/wig"
# ichor = config["dir"]["data"] + "/ichor"
# cfdna_wgs_logs = config["dir"]["data"] + "logs/cfdna_wgs"
# ichor_nopon = config["dir"]["data"] + "/ichor_nopon"

Functions

# Explicit import: do not depend on `os` happening to be present in the
# Snakefile namespace
import os


def _readable_flags(paths):
    """Return one bool per path: True when the file is readable by this process."""
    return [os.access(path, os.R_OK) for path in paths]


# cfDNA WGS libraries: keep readable rows with library_type "wgs" and
# isolation_type "cfdna"
libraries = pd.read_table(config["datadir"] + "/inputs/libraries.tsv")

readable = _readable_flags(libraries.file)
libraries['readable']=readable

cfdna_libraries = libraries
cfdna_libraries = cfdna_libraries[cfdna_libraries.library_type == "wgs"]
cfdna_libraries = cfdna_libraries[cfdna_libraries.isolation_type == "cfdna"]
cfdna_libraries = cfdna_libraries[cfdna_libraries.readable == True]

library_indict = cfdna_libraries["library"].tolist()
file_indict = cfdna_libraries["file"].tolist()
lib_dict = dict(zip(library_indict, file_indict))

CFDNA_WGS_LIBRARIES = list(lib_dict.keys())

# CNA libraries: keep rows whose bam file is readable.
# NOTE: lib_dict is reassigned below, so after this block it maps CNA
# library ids to bam files (not the cfDNA dictionary built above).
cna_libraries = pd.read_table(config["datadir"] + "/inputs/cna_libraries.tsv")

readable = _readable_flags(cna_libraries.bam_file)
cna_libraries['readable']=readable

cna_libraries = cna_libraries[cna_libraries.readable == True]

library_indict = cna_libraries["library"].tolist()
file_indict = cna_libraries["bam_file"].tolist()
lib_dict = dict(zip(library_indict, file_indict))

CNA_WGS_LIBRARIES = list(lib_dict.keys())

All rule

# Default target of the integration test. Only the fragmentomics outputs are
# active: the kept-bin bed, per-library fragment beds and window counts, the
# merged count table and the centered ratios. The symlink and CNA targets,
# and the intermediate fragmentomics targets, are commented out.
rule all:
    input:
# # From this snakefile:
#         # cfdna_wgs_symlink:
#         expand(cfdna_wgs_cna_in_bams +
#                "/{library}.bam",
#                library = lib_dict.keys()),
# # From cna.smk
#         # cna_frag_filt:
#         expand(cfdna_wgs_cna_frag_bams +
#                "/{library}_frag{frag_distro}.bam",
#                library = CNA_WGS_LIBRARIES,
#                frag_distro = FRAG_DISTROS),
#         # bam_to_wig:
#         expand(cfdna_wgs_cna_wigs +
#                "/{library}_frag{frag_distro}.wig",
#                library = CNA_WGS_LIBRARIES,
#                frag_distro = FRAG_DISTROS),
#         # ichor_nopon:
#         expand(cfdna_wgs_cna_ichor_nopon +
#                "/{library}_frag{frag_distro}.cna.seg",
#                library = CNA_WGS_LIBRARIES,
#                frag_distro = FRAG_DISTROS),
# From frag.smk
        # make_gc_map_bind:
        refdir + "/keep_5mb.bed",
        # filt_bam_to_frag_bed:
        expand(cfdna_wgs_frag_beds +
               "/{library}_filt.bed",
               library = CNA_WGS_LIBRARIES),
        # # gc_distro:
        # expand(cfdna_wgs_frag_gc_distros +
        #        "/{library}_gc_distro.csv",
        #        library = CNA_WGS_LIBRARIES),
        # # healthy_gc:
        # cfdna_wgs_frag_gc_distros + "/healthy_med.rds",
        # #
        # expand(cfdna_wgs_frag_beds +
        #        "/{library}_sampled_frag.bed",
        #       library = CNA_WGS_LIBRARIES),
        # expand(cfdna_wgs_frag_beds +
        #        "/{library}_norm_{length}.bed",
        #        library = CNA_WGS_LIBRARIES,
        #        length = ["short", "long"]),
        # frag_window_count:
        expand(cfdna_wgs_frag_counts +
               "/{library}_cnt_{length}.tmp",
               library = CNA_WGS_LIBRARIES,
               length = ["short", "long"]),
        # cfdna_wgs_count_merge:
        cfdna_wgs_frag + "/frag_counts.tsv",
        #
        # unit_cent_sd:
        cfdna_wgs_frag + "/ratios.tsv",

Symlink input bams

  • Snakemake
    # Link each library's source file (looked up in lib_dict by the
    # {library} wildcard) into the CNA input bam directory
    rule cfdna_wgs_symlink:
        input: lambda wildcards: lib_dict[wildcards.library],
        output: cfdna_wgs_cna_in_bams + "/{library}.bam",
        container: cfdna_wgs_container,
        shell:
            """
            ln --force --relative --symbolic {input} {output}
            """
        

Includes statements

include: cfdna_wgs_repo + "/workflow/reads.smk"
include: cfdna_wgs_repo + "/workflow/cna.smk"
include: cfdna_wgs_repo + "/workflow/frag.smk"

README

Introduction

This repository hosts a snakemake workflow for basic processing of whole-genome sequencing reads from cell-free DNA.

resources/int_test.png

Organization

Master branch of the repository contains most recent developments while stable versions are saved as terminal branches (e.g. stable.1.0.0).

The workflow directory contains two types of workflows: process-focused snakefiles (reads.smk, cna.smk, frag.smk), suitable for integration into another snakemake pipeline using the include: directive, and the _int_test snakefile, which gives examples of such integration using the repository test data.

Use

Changelog

  • [2023-01-26 Thu] - Version 9.1.0: Repo cleanup
  • [2023-01-26 Thu] - Version 9.0.0: Removed -f 3 flag for perfectly matched pairs in samtools filtering as the black from BWA removes some fragments at a set max length. Added framework for benchmark analysis. Added conditional execution of downsampling. Removed (temporarily) final wig and ichor commands of CNA as these don’t currently run correctly without full genome alignment, so can’t be validated on test data. Added local documentation of cfdna-wgs dockerfile.
  • [2023-01-21 Sat] - Version 8.0.0: Corrected rule filt_bam_to_frag_bed to fix mates of inputs, which seems to prevent errors in the bamtobed call. Frag_window_count now uses windows of consistent 5 Mb size, which are generated from rule make_gc_map_bind. Added a merged fragment counts file and zero-centered unit SD counts.
  • [2022-12-07 Wed] - Version 7.0.0: Added copy number alteration and DELFI fragmentomics.
  • [2022-10-17 Mon] - Version 6.0.0: Using fastp for read trimming (replaces trimmomatic). Simplified naming schema. Removed downsampling (will reinstate in later version).
  • [2022-09-08 Thu] - Version 5.3.0: some minor name changes
  • [2022-08-19 Fri] - Version 5.2.0 validated: Adds bamCoverage and plotCoverage from deeptools. Benchmarks BWA.
  • [2022-08-09 Tue] - Version 5.1.0 validated: Added cfdna wgs-specific container for each rule, referenced to config
  • [2022-08-05 Fri] - Version 5.0.0 validated: Added a symlink rule based on python dictionary. Added repo-specific output naming, added checks for sequence type and file readability to input tsv.
  • [2022-06-27 Mon] - Version 4 validated. Further expanded read_qc.tsv table. Removed bam post-processing step and added a more expansive bam filtering step. Updated downsampling to work off filtered alignments.
  • [2022-06-26 Sun] - Version 3.2 validated. Expanded the qc aggregate table and added some comments.
  • [2022-06-24 Fri] - Validate version 3.1 which includes genome index build as a snakefile rule.
  • [2022-06-24 Fri] - Validated version 3 with read number checkpoint for down-sampling.
  • [2022-05-31 Tue] - Conforms to current biotools best practices.
  • [2022-04-29 Fri] - Moved multiqc to integration testing as inputs are dependent on final sample labels. Integration testing works per this commit.

Development

Version 9

  • kill v7- not working for CNA

Project stable version update

Make benchmarking table

  • for file in ./*; do base=$(basename "$file"); str=$(tail -n1 "$file"); echo "$base $str"; done

Analysis of copy number alteration

Preamble

#########1#########2#########3#########4#########5#########6#########7#########8
#                                                                              #
#                    Copy-number Alteration Analysis of                        #
#                  Cell-free DNA Whole Genome Sequencing                       #
#                                                                              #
#                                                                              #
#########1#########2#########3#########4#########5#########6#########7#########8

Convert bam to wig

  • Snakemake
    # Use readCounter to create windowed wig from bam file.
    # Reads with mapping quality >= 20 are counted in 1 Mb windows over the
    # chromosomes listed in params.chrs; readCounter writes the wig to stdout,
    # which is redirected to the output file, and stderr goes to the rule log.
    rule bam_to_wig:
        benchmark: benchdir + "/{library}_ds{downsample}_{frag_distro}_cfdna_wgs_bam_to_wig.benchmark.txt",
        container: cfdna_wgs_container,
        input: cfdna_wgs_bams + "/{library}_ds{downsample}_frag{frag_distro}.bam",
        log: logdir + "/{library}_ds{downsample}_{frag_distro}_cfdna_wgs_bam_to_wig.log",
        output: cfdna_wgs_wigs + "/{library}_ds{downsample}_frag{frag_distro}.wig",
        params:
            chrs = chrs,
            outdir = cfdna_wgs_wigs,
            # NOTE(review): script and threads are not used by the shell
            # command below; kept so the rule's params stay unchanged.
            script = cfdna_wgs_scriptdir + "/bam_to_wig.sh",
            threads = cfdna_wgs_threads,
        shell:
            """
            mkdir -p {params.outdir}
            /opt/hmmcopy_utils/bin/readCounter \
            --chromosome "{params.chrs}" \
            --quality 20 \
            --window 1000000 \
            {input} > {output} 2> {log}
            """
        
  • Shell script
    #!/usr/bin/env bash
    # Count reads (mapping quality >= 20) in 1 Mb windows of a bam file and
    # write the result as a wig file.
    #
    # Usage: bam_to_wig.sh <input.bam> <output.wig> [chromosomes]
    #   input.bam    bam file to count
    #   output.wig   wig file to write (readCounter prints the wig to stdout)
    #   chromosomes  optional comma-separated chromosome list passed to
    #                readCounter --chromosome; defaults to chr1-22, X and Y
    input=$1
    output=$2
    chrs=${3:-chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chrX,chrY}
    
    # Use the positional arguments: Snakemake-style {input}/{output}
    # placeholders are not expanded inside a standalone shell script.
    /opt/hmmcopy_utils/bin/readCounter --window 1000000 --quality 20 \
        --chromosome "$chrs" \
        "$input" > "$output"
    
        

Run ichorCNA without a panel of normals

  • Snakemake
    # Run ichorCNA without a panel of normals.
    # The R script names its result files after --id (e.g. <outDir>/<id>.params.txt),
    # so the id must reproduce the full output stem, including the
    # _ds{downsample} part, for the declared output to be produced.
    # NOTE(review): hg38 GC/mappability/centromere references are passed but
    # --genomeBuild/--genomeStyle are left at the script defaults (hg19/NCBI);
    # confirm this is intended.
    rule ichor_nopon:
        input: cfdna_wgs_wigs + "/{library}_ds{downsample}_frag{frag_distro}.wig",
        output: cfdna_wgs_ichor_nopon + "/{library}_ds{downsample}_frag{frag_distro}.cna.seg",
        params:
            script = cfdna_wgs_scriptdir + "/MOD_runIchorCNA.R",
            out_dir = cfdna_wgs_ichor_nopon,
        container:
            cfdna_wgs_container,
        shell:
            """
            Rscript {params.script} \
             --id {wildcards.library}_ds{wildcards.downsample}_frag{wildcards.frag_distro} \
             --WIG {input} \
             --gcWig /opt/ichorCNA/inst/extdata/gc_hg38_1000kb.wig \
             --mapWig /opt/ichorCNA/inst/extdata/map_hg38_1000kb.wig \
             --centromere /opt/ichorCNA/inst/extdata/GRCh38.GCA_000001405.2_centromere_acen.txt \
             --normal "c(0.95, 0.99, 0.995, 0.999)" \
             --ploidy "c(2)" \
             --maxCN 3 \
             --estimateScPrevalence FALSE \
             --scStates "c()" \
             --outDir {params.out_dir}
            """
        
  • Rscript
    # file:   ichorCNA.R
    # authors: Gavin Ha, Ph.D.
    #          Fred Hutch
    # contact: <gha@fredhutch.org>
    #
    #         Justin Rhoades
    #          Broad Institute
    # contact: <rhoades@broadinstitute.org>
    
    # ichorCNA: https://github.com/broadinstitute/ichorCNA
    # date:   July 24, 2019
    # description: Hidden Markov model (HMM) to analyze Ultra-low pass whole genome sequencing (ULP-WGS) data.
    # This script is the main script to run the HMM.
    
    library(optparse)
    
    ## Command-line interface.
    ## Options typed "character" that hold R expressions (--normal, --ploidy,
    ## --scStates, --lambda, --chrs, --chrTrain, --chrNormalize, --plotYLim) are
    ## evaluated with eval(parse()) further down, so they must be valid R code
    ## such as "c(0.5, 0.9)".
    option_list <- list(
      make_option(c("--WIG"), type = "character", help = "Path to tumor WIG file. Required."),
      make_option(c("--NORMWIG"), type = "character", default=NULL, help = "Path to normal WIG file. Default: [%default]"),
      make_option(c("--gcWig"), type = "character", help = "Path to GC-content WIG file; Required"),
      make_option(c("--mapWig"), type = "character", default=NULL, help = "Path to mappability score WIG file. Default: [%default]"),
      make_option(c("--normalPanel"), type="character", default=NULL, help="Median corrected depth from panel of normals. Default: [%default]"),
      make_option(c("--exons.bed"), type = "character", default=NULL, help = "Path to bed file containing exon regions. Default: [%default]"),
      make_option(c("--id"), type = "character", default="test", help = "Patient ID. Default: [%default]"),
      make_option(c("--centromere"), type="character", default=NULL, help = "File containing Centromere locations; if not provided then will use hg19 version from ichorCNA package. Default: [%default]"),
      make_option(c("--minMapScore"), type = "numeric", default=0.9, help="Include bins with a minimum mappability score of this value. Default: [%default]."),
      make_option(c("--rmCentromereFlankLength"), type="numeric", default=1e5, help="Length of region flanking centromere to remove. Default: [%default]"),
      make_option(c("--normal"), type="character", default="0.5", help = "Initial normal contamination; can be more than one value if additional normal initializations are desired. Default: [%default]"),
      make_option(c("--scStates"), type="character", default="NULL", help = "Subclonal states to consider. Default: [%default]"),
      make_option(c("--coverage"), type="numeric", default=NULL, help = "PICARD sequencing coverage. Default: [%default]"),
      make_option(c("--lambda"), type="character", default="NULL", help="Initial Student's t precision; must contain 4 values (e.g. c(1500,1500,1500,1500)); if not provided then will automatically use based on variance of data. Default: [%default]"),
      make_option(c("--lambdaScaleHyperParam"), type="numeric", default=3, help="Hyperparameter (scale) for Gamma prior on Student's-t precision. Default: [%default]"),
      #	make_option(c("--kappa"), type="character", default=50, help="Initial state distribution"),
      make_option(c("--ploidy"), type="character", default="2", help = "Initial tumour ploidy; can be more than one value if additional ploidy initializations are desired. Default: [%default]"),
      make_option(c("--maxCN"), type="numeric", default=7, help = "Total clonal CN states. Default: [%default]"),
      make_option(c("--estimateNormal"), type="logical", default=TRUE, help = "Estimate normal. Default: [%default]"),
      make_option(c("--estimateScPrevalence"), type="logical", default=TRUE, help = "Estimate subclonal prevalence. Default: [%default]"),
      make_option(c("--estimatePloidy"), type="logical", default=TRUE, help = "Estimate tumour ploidy. Default: [%default]"),
      make_option(c("--maxFracCNASubclone"), type="numeric", default=0.7, help="Exclude solutions with fraction of subclonal events greater than this value. Default: [%default]"),
      make_option(c("--maxFracGenomeSubclone"), type="numeric", default=0.5, help="Exclude solutions with subclonal genome fraction greater than this value. Default: [%default]"),
      make_option(c("--minSegmentBins"), type="numeric", default=50, help="Minimum number of bins for largest segment threshold required to estimate tumor fraction; if below this threshold, then will be assigned zero tumor fraction."),
      make_option(c("--altFracThreshold"), type="numeric", default=0.05, help="Minimum proportion of bins altered required to estimate tumor fraction; if below this threshold, then will be assigned zero tumor fraction. Default: [%default]"),
      make_option(c("--chrNormalize"), type="character", default="c(1:22)", help = "Specify chromosomes to normalize GC/mappability biases. Default: [%default]"),
      make_option(c("--chrTrain"), type="character", default="c(1:22)", help = "Specify chromosomes to estimate params. Default: [%default]"),
      make_option(c("--chrs"), type="character", default="c(1:22,\"X\")", help = "Specify chromosomes to analyze. Default: [%default]"),
      make_option(c("--genomeBuild"), type="character", default="hg19", help="Genome build. Default: [%default]"),
      make_option(c("--genomeStyle"), type = "character", default = "NCBI", help = "NCBI or UCSC chromosome naming convention; use UCSC if desired output is to have \"chr\" string. [Default: %default]"),
      make_option(c("--normalizeMaleX"), type="logical", default=TRUE, help = "If male, then normalize chrX by median. Default: [%default]"),
      make_option(c("--minTumFracToCorrect"), type="numeric", default=0.1, help = "Tumor-fraction correction of bin and segment-level CNA if sample has minimum estimated tumor fraction. [Default: %default]"),
      make_option(c("--fracReadsInChrYForMale"), type="numeric", default=0.001, help = "Threshold for fraction of reads in chrY to assign as male. Default: [%default]"),
      make_option(c("--includeHOMD"), type="logical", default=FALSE, help="If FALSE, then exclude HOMD state. Useful when using large bins (e.g. 1Mb). Default: [%default]"),
      make_option(c("--txnE"), type="numeric", default=0.9999999, help = "Self-transition probability. Increase to decrease number of segments. Default: [%default]"),
      make_option(c("--txnStrength"), type="numeric", default=1e7, help = "Transition pseudo-counts. Exponent should be the same as the number of decimal places of --txnE. Default: [%default]"),
      make_option(c("--plotFileType"), type="character", default="pdf", help = "File format for output plots. Default: [%default]"),
      make_option(c("--plotYLim"), type="character", default="c(-2,2)", help = "ylim to use for chromosome plots. Default: [%default]"),
      make_option(c("--outDir"), type="character", default="./", help = "Output Directory. Default: [%default]"),
      make_option(c("--libdir"), type = "character", default=NULL, help = "Script library path. Usually exclude this argument unless custom modifications have been made to the ichorCNA R package code and the user would like to source those R files. Default: [%default]")
    )
    ## Parse the command line and echo the resolved options to stdout so the
    ## settings of a run are recorded alongside its output.
    parseobj <- OptionParser(option_list=option_list)
    opt <- parse_args(parseobj)
    print(opt)
    ## Global R options and copy-number packages, then unpack the parsed
    ## command-line options into plain variables used by the rest of the script.
    options(scipen=0, stringsAsFactors=F)
    
    library(HMMcopy)
    library(GenomicRanges)
    library(GenomeInfoDb)
    # stringsAsFactors is set a second time here (already set above).
    options(stringsAsFactors=FALSE)
    options(bitmapType='cairo')
    
    patientID <- opt$id
    tumour_file <- opt$WIG
    normal_file <- opt$NORMWIG
    gcWig <- opt$gcWig
    mapWig <- opt$mapWig
    normal_panel <- opt$normalPanel
    exons.bed <- opt$exons.bed  # NULL by default; "None"/"NULL" are also treated as unset below
    centromere <- opt$centromere
    minMapScore <- opt$minMapScore
    flankLength <- opt$rmCentromereFlankLength
    # --normal, --scStates, --lambda and --ploidy arrive as strings holding R
    # expressions (e.g. "c(0.95, 0.99)") and are evaluated into R values here.
    normal <- eval(parse(text = opt$normal))
    scStates <- eval(parse(text = opt$scStates))
    lambda <- eval(parse(text = opt$lambda))
    lambdaScaleHyperParam <- opt$lambdaScaleHyperParam
    estimateNormal <- opt$estimateNormal
    estimatePloidy <- opt$estimatePloidy
    estimateScPrevalence <- opt$estimateScPrevalence
    maxFracCNASubclone <- opt$maxFracCNASubclone
    maxFracGenomeSubclone <- opt$maxFracGenomeSubclone
    minSegmentBins <- opt$minSegmentBins
    altFracThreshold <- opt$altFracThreshold
    ploidy <- eval(parse(text = opt$ploidy))
    coverage <- opt$coverage
    maxCN <- opt$maxCN
    txnE <- opt$txnE
    txnStrength <- opt$txnStrength
    normalizeMaleX <- as.logical(opt$normalizeMaleX)
    includeHOMD <- as.logical(opt$includeHOMD)
    minTumFracToCorrect <- opt$minTumFracToCorrect
    fracReadsInChrYForMale <- opt$fracReadsInChrYForMale
    # Fixed value (not a command-line option); passed to loadReadCountsFromWig.
    chrXMedianForMale <- -0.1
    outDir <- opt$outDir
    libdir <- opt$libdir
    plotFileType <- opt$plotFileType
    plotYLim <- eval(parse(text=opt$plotYLim))
    gender <- NULL
    # Workspace image path; save.image() is called on it at several checkpoints.
    outImage <- paste0(outDir,"/", patientID,".RData")
    genomeBuild <- opt$genomeBuild
    genomeStyle <- opt$genomeStyle
    # Chromosome sets are evaluated from R expressions, coerced to character,
    # and then renamed to the requested naming style (NCBI or UCSC).
    chrs <- as.character(eval(parse(text = opt$chrs)))
    chrTrain <- as.character(eval(parse(text=opt$chrTrain)));
    chrNormalize <- as.character(eval(parse(text=opt$chrNormalize)));
    seqlevelsStyle(chrs) <- genomeStyle
    seqlevelsStyle(chrNormalize) <- genomeStyle
    seqlevelsStyle(chrTrain) <- genomeStyle
    
    ## load ichorCNA library or source R scripts
    ## When --libdir is given, the package's R sources are sourced from that
    ## directory instead of loading the installed ichorCNA package.
    if (!is.null(libdir) && libdir != "None"){
      source(paste0(libdir,"/R/utils.R"))
      source(paste0(libdir,"/R/segmentation.R"))
      source(paste0(libdir,"/R/EM.R"))
      source(paste0(libdir,"/R/output.R"))
      source(paste0(libdir,"/R/plotting.R"))
    } else {
      library(ichorCNA)
    }
    
    ## load seqinfo
    seqinfo <- getSeqInfo(genomeBuild, genomeStyle)
    
    ## --WIG is either a single wig file (path ends in "wig") or a headerless
    ## tab-delimited table; either way wigFiles has the sample id in column 1
    ## and the wig path in column 2.
    if (substr(tumour_file,nchar(tumour_file)-2,nchar(tumour_file)) == "wig") {
      wigFiles <- data.frame(cbind(patientID, tumour_file))
    } else {
      wigFiles <- read.delim(tumour_file, header=F, as.is=T)
    }
    
    ## FILTER BY EXONS IF PROVIDED ##
    ## add gc and map to GRanges object ##
    if (is.null(exons.bed) || exons.bed == "None" || exons.bed == "NULL"){
      targetedSequences <- NULL
    }else{
      targetedSequences <- read.delim(exons.bed, header=T, sep="\t")
    }
    
    ## load PoN
    ## "None"/"NULL" strings are normalized to a real NULL.
    if (is.null(normal_panel) || normal_panel == "None" || normal_panel == "NULL"){
      normal_panel <- NULL
    }
    
    ## Fall back to the GRCh37 centromere table shipped with ichorCNA when no
    ## centromere file is supplied.
    if (is.null(centromere) || centromere == "None" || centromere == "NULL"){ # no centromere file provided
      centromere <- system.file("extdata", "GRCh37.p13_centromere_UCSC-gapTable.txt",
                                package = "ichorCNA")
    }
    centromere <- read.delim(centromere,header=T,stringsAsFactors=F,sep="\t")
    ## outImage lives directly in outDir, which nothing has created yet at this
    ## point; make sure it exists so the checkpoint below cannot fail.
    dir.create(outDir, recursive = TRUE, showWarnings = FALSE)
    save.image(outImage)
    ## LOAD IN WIG FILES ##
    ## For each row of wigFiles (column 1 = sample id, column 2 = wig path):
    ## load the read counts, correct them with the GC (and optional
    ## mappability) tracks, optionally normalize by a matched normal or panel
    ## of normals, and write <outDir>/<id>.correctedDepth.txt.
    numSamples <- nrow(wigFiles)
    if (numSamples < 1) {
      stop("No samples found in ", tumour_file)
    }
    
    tumour_copy <- list()
    ## seq_len() avoids the 1:0 trap of iterating c(1, 0) on an empty table
    for (i in seq_len(numSamples)) {
      id <- wigFiles[i,1]
      ## create output directories for each sample ##
      ## showWarnings = FALSE: the directory may already exist on a rerun
      dir.create(paste0(outDir, "/", id, "/"), recursive = TRUE, showWarnings = FALSE)
      ### LOAD TUMOUR AND NORMAL FILES ###
      message("Loading tumour file:", wigFiles[i,1])
      tumour_reads <- wigToGRanges(wigFiles[i,2])
    
      ## LOAD GC/MAP WIG FILES ###
      # find the bin size and load corresponding wig files #
      binSize <- as.data.frame(tumour_reads[1,])$width
      message("Reading GC and mappability files")
      if (is.null(gcWig) || gcWig == "None" || gcWig == "NULL"){
        stop("GC wig file is required")
      }
      gc <- wigToGRanges(gcWig)
      if (is.null(mapWig) || mapWig == "None" || mapWig == "NULL"){
        message("No mappability wig file input, excluding from correction")
        map <- NULL
      } else {
        map <- wigToGRanges(mapWig)
      }
      message("Correcting Tumour")
    
      counts <- loadReadCountsFromWig(tumour_reads, chrs = chrs, gc = gc, map = map,
                                      centromere = centromere, flankLength = flankLength,
                                      targetedSequences = targetedSequences, chrXMedianForMale = chrXMedianForMale,
                                      genomeStyle = genomeStyle, fracReadsInChrYForMale = fracReadsInChrYForMale,
                                      chrNormalize = chrNormalize, mapScoreThres = minMapScore)
      tumour_copy[[id]] <- counts$counts #as(counts$counts, "GRanges")
      gender <- counts$gender
      ## load in normal file if provided
      if (!is.null(normal_file) && normal_file != "None" && normal_file != "NULL"){
        message("Loading normal file:", normal_file)
        normal_reads <- wigToGRanges(normal_file)
        message("Correcting Normal")
        counts <- loadReadCountsFromWig(normal_reads, chrs=chrs, gc=gc, map=map,
                                        centromere=centromere, flankLength = flankLength, targetedSequences=targetedSequences,
                                        genomeStyle = genomeStyle, chrNormalize = chrNormalize, mapScoreThres = minMapScore)
        normal_copy <- counts$counts #as(counts$counts, "GRanges")
        gender.normal <- counts$gender
      }else{
        normal_copy <- NULL
      }
    
      ### DETERMINE GENDER ###
      ## The gender used downstream is the one estimated from the tumour wig;
      ## the matched normal (when given) is only used to flag a mismatch.
      message("Determining gender...", appendLF = FALSE)
      gender.mismatch <- FALSE
      if (!is.null(normal_copy)){
        if (gender$gender != gender.normal$gender){
          # normal and tumour disagree on gender
          gender.mismatch <- TRUE
        }
      }
      message("Gender ", gender$gender)
    
      ## NORMALIZE GENOME-WIDE BY MATCHED NORMAL OR NORMAL PANEL (MEDIAN) ##
      tumour_copy[[id]] <- normalizeByPanelOrMatchedNormal(tumour_copy[[id]], chrs = chrs,
          normal_panel = normal_panel, normal_copy = normal_copy,
          gender = gender$gender, normalizeMaleX = normalizeMaleX)
    
      ### OUTPUT FILE ###
      ### PUTTING TOGETHER THE COLUMNS IN THE OUTPUT ###
      outMat <- as.data.frame(tumour_copy[[id]])
      #outMat <- outMat[,c(1,2,3,12)]
      outMat <- outMat[,c("seqnames","start","end","copy")]
      colnames(outMat) <- c("chr","start","end","log2_TNratio_corrected")
      outFile <- paste0(outDir,"/",id,".correctedDepth.txt")
      message(paste("Outputting to:", outFile))
      write.table(outMat, file=outFile, row.names=F, col.names=T, quote=F, sep="\t")
    
    } ## end of for each sample
    
    ## Logical mask of bins (taken from the first sample) that lie on the
    ## training chromosomes.
    chrInd <- as.character(seqnames(tumour_copy[[1]])) %in% chrTrain
    ## get positions that are valid
    ## A bin is kept only if it is flagged valid in every sample.
    valid <- tumour_copy[[1]]$valid
    if (length(tumour_copy) >= 2) {
      for (i in 2:length(tumour_copy)){
        valid <- valid & tumour_copy[[i]]$valid
      }
    }
    ## Checkpoint the workspace before the HMM runs.
    save.image(outImage)
    
    ### RUN HMM ###
    ## Fit the HMM once for every combination of initial normal fraction (n)
    ## and ploidy (p). Each fit is stored in `results`, summarised in one row of
    ## `loglik`, and plotted per sample; `counter` indexes the fits actually run.
    ## store the results for different normal and ploidy solutions ##
    ptmTotalSolutions <- proc.time() # start total timer
    results <- list()
    loglik <- as.data.frame(matrix(NA, nrow = length(normal) * length(ploidy), ncol = 7,
                     dimnames = list(c(), c("init", "n_est", "phi_est", "BIC",
                     												"Frac_genome_subclonal", "Frac_CNA_subclonal", "loglik"))))
    counter <- 1
    compNames <- rep(NA, nrow(loglik))
    mainName <- rep(NA, length(normal) * length(ploidy))
    #### restart for purity and ploidy values ####
    for (n in normal){
      for (p in ploidy){
        # The n = 0.95 initialization is only run together with ploidy 2.
        # Skipped combinations leave their loglik row NA (filtered out later)
        # and do not advance `counter`.
        if (n == 0.95 & p != 2) {
            next
        }
        logR <- as.data.frame(lapply(tumour_copy, function(x) { x$copy })) # NEED TO EXCLUDE CHR X #
        # Default parameters are derived from valid bins on the training chromosomes only.
        param <- getDefaultParameters(logR[valid & chrInd, , drop=F], maxCN = maxCN, includeHOMD = includeHOMD,
                    ct.sc=scStates, ploidy = floor(p), e=txnE, e.same = 50, strength=txnStrength)
        param$phi_0 <- rep(p, numSamples)
        param$n_0 <- rep(n, numSamples)
    
        ############################################
        ######## CUSTOM PARAMETER SETTINGS #########
        ############################################
        # 0.1x cfDNA #
        # Without --lambda, the precision is derived from the standard deviation
        # of the log ratios and scaled down for copy number >= 4, for the
        # highest copy-number state and for subclonal states. With --lambda, the
        # four supplied values are assigned by copy-number state instead.
        if (is.null(lambda)){
    			logR.var <- 1 / ((apply(logR, 2, sd, na.rm = TRUE) / sqrt(length(param$ct))) ^ 2)
    			param$lambda <- rep(logR.var, length(param$ct))
    			param$lambda[param$ct %in% c(2)] <- logR.var
    			param$lambda[param$ct %in% c(1,3)] <- logR.var
    			param$lambda[param$ct >= 4] <- logR.var / 5
    			param$lambda[param$ct == max(param$ct)] <- logR.var / 15
    			param$lambda[param$ct.sc.status] <- logR.var / 10
        }else{
    			param$lambda[param$ct %in% c(2)] <- lambda[2]
    			param$lambda[param$ct %in% c(1)] <- lambda[1]
    			param$lambda[param$ct %in% c(3)] <- lambda[3]
    			param$lambda[param$ct >= 4] <- lambda[4]
    			param$lambda[param$ct == max(param$ct)] <- lambda[2] / 15
    			param$lambda[param$ct.sc.status] <- lambda[2] / 10
    		}
    		param$alphaLambda <- rep(lambdaScaleHyperParam, length(param$ct))
        # 1x bulk tumors #
        #param$lambda[param$ct %in% c(2)] <- 2000
        #param$lambda[param$ct %in% c(1)] <- 1750
        #param$lambda[param$ct %in% c(3)] <- 1750
        #param$lambda[param$ct >= 4] <- 1500
        #param$lambda[param$ct == max(param$ct)] <- 1000 / 25
    		#param$lambda[param$ct.sc.status] <- 1000 / 75
    		#param$alphaLambda[param$ct.sc.status] <- 4
    		#param$alphaLambda[param$ct %in% c(1,3)] <- 5
    		#param$alphaLambda[param$ct %in% c(2)] <- 5
    		#param$alphaLambda[param$ct == max(param$ct)] <- 4
    
    		#############################################
    		################ RUN HMM ####################
    		#############################################
        hmmResults.cor <- HMMsegment(tumour_copy, valid, dataType = "copy",
                                     param = param, chrTrain = chrTrain, maxiter = 50,
                                     estimateNormal = estimateNormal, estimatePloidy = estimatePloidy,
                                     estimateSubclone = estimateScPrevalence, verbose = TRUE)
    
        # Per-sample post-processing: zero out the tumor fraction when there is
        # too little evidence of alteration, correct integer copy number, and plot.
        for (s in 1:numSamples){
      		iter <- hmmResults.cor$results$iter
      		id <- names(hmmResults.cor$cna)[s]
    
      		## convert full diploid solution (of chrs to train) to have 1.0 normal or 0.0 purity
      		## check if there is an altered segment that has at least a minimum # of bins
      		segsS <- hmmResults.cor$results$segs[[s]]
      		segsS <- segsS[segsS$chr %in% chrTrain, ]
      		segAltInd <- which(segsS$event != "NEUT")
      		maxBinLength = -Inf
      		# segAltInd holds positive row indices, so the sum is > 0 exactly when
      		# at least one non-neutral segment exists.
      		if (sum(segAltInd) > 0){
      			maxInd <- which.max(segsS$end[segAltInd] - segsS$start[segAltInd] + 1)
      			maxSegRD <- GRanges(seqnames=segsS$chr[segAltInd[maxInd]],
      								ranges=IRanges(start=segsS$start[segAltInd[maxInd]], end=segsS$end[segAltInd[maxInd]]))
      			hits <- findOverlaps(query=maxSegRD, subject=tumour_copy[[s]][valid, ])
      			maxBinLength <- length(subjectHits(hits))
      		}
      		## check if there are proportion of total bins altered
      		# if segment size smaller than minSegmentBins, but altFrac > altFracThreshold, then still estimate TF
      		cnaS <- hmmResults.cor$cna[[s]]
      		# Despite its name, altInd is TRUE for NEUTRAL bins; altFrac negates it.
      		altInd <- cnaS[cnaS$chr %in% chrTrain, "event"] == "NEUT"
      		altFrac <- sum(!altInd, na.rm=TRUE) / length(altInd)
      		# Both evidence checks failed: set the normal fraction to 1 (tumor fraction 0).
      		if ((maxBinLength <= minSegmentBins) & (altFrac <= altFracThreshold)){
      			hmmResults.cor$results$n[s, iter] <- 1.0
      		}
    
          # correct integer copy number based on estimated purity and ploidy
          correctedResults <- correctIntegerCN(cn = hmmResults.cor$cna[[s]],
                segs = hmmResults.cor$results$segs[[s]],
                purity = 1 - hmmResults.cor$results$n[s, iter], ploidy = hmmResults.cor$results$phi[s, iter],
                cellPrev = 1 - hmmResults.cor$results$sp[s, iter],
                maxCNtoCorrect.autosomes = maxCN, maxCNtoCorrect.X = maxCN, minPurityToCorrect = minTumFracToCorrect,
                gender = gender$gender, chrs = chrs, correctHOMD = includeHOMD)
          hmmResults.cor$results$segs[[s]] <- correctedResults$segs
          hmmResults.cor$cna[[s]] <- correctedResults$cn
    
          	## plot solution ##
      		outPlotFile <- paste0(outDir, "/", id, "/", id, "_genomeWide_", "n", n, "-p", p)
      		mainName[counter] <- paste0(id, ", n: ", n, ", p: ", p, ", log likelihood: ", signif(hmmResults.cor$results$loglik[hmmResults.cor$results$iter], digits = 4))
      		plotGWSolution(hmmResults.cor, s=s, outPlotFile=outPlotFile, plotFileType=plotFileType,
                logR.column = "logR", call.column = "Corrected_Call",
      					 plotYLim=plotYLim, estimateScPrevalence=estimateScPrevalence, seqinfo=seqinfo, main=mainName[counter])
        }
        # Record this fit and its summary row. Values for multiple samples are
        # joined with "," so the n_est, phi_est and Frac_* columns hold strings.
        iter <- hmmResults.cor$results$iter
        results[[counter]] <- hmmResults.cor
        loglik[counter, "loglik"] <- signif(hmmResults.cor$results$loglik[iter], digits = 4)
        subClonalBinCount <- unlist(lapply(hmmResults.cor$cna, function(x){ sum(x$subclone.status) }))
        fracGenomeSub <- subClonalBinCount / unlist(lapply(hmmResults.cor$cna, function(x){ nrow(x) }))
        fracAltSub <- subClonalBinCount / unlist(lapply(hmmResults.cor$cna, function(x){ sum(x$copy.number != 2) }))
        # An NA fraction is replaced by 0.
        fracAltSub <- lapply(fracAltSub, function(x){if (is.na(x)){0}else{x}})
        loglik[counter, "Frac_genome_subclonal"] <- paste0(signif(fracGenomeSub, digits=2), collapse=",")
        loglik[counter, "Frac_CNA_subclonal"] <- paste0(signif(as.numeric(fracAltSub), digits=2), collapse=",")
        loglik[counter, "init"] <- paste0("n", n, "-p", p)
        loglik[counter, "n_est"] <- paste(signif(hmmResults.cor$results$n[, iter], digits = 2), collapse = ",")
        loglik[counter, "phi_est"] <- paste(signif(hmmResults.cor$results$phi[, iter], digits = 4), collapse = ",")
    
        counter <- counter + 1
      }
    }
    ## get total time for all solutions ##
    elapsedTimeSolutions <- proc.time() - ptmTotalSolutions
    message("Total ULP-WGS HMM Runtime: ", format(elapsedTimeSolutions[3] / 60, digits = 2), " min.")
    
    ### SAVE R IMAGE ###
    save.image(outImage)
    #save(tumour_copy, results, loglik, file=paste0(outDir,"/",id,".RData"))
    
    ### SELECT SOLUTION WITH LARGEST LIKELIHOOD ###
    ## Rows of skipped initializations were never filled in; drop them. The
    ## remaining rows line up with the entries of `results` and `mainName`.
    loglik <- loglik[!is.na(loglik$init), ]
    if (nrow(loglik) == 0){
      stop("No HMM solutions were computed; check the --normal and --ploidy initializations")
    }
    
    ## The Frac_* columns are character (one value per sample, comma-joined), so
    ## they must be parsed before being compared with a numeric threshold;
    ## comparing the strings directly is lexicographic (e.g. "1e-04" > "0.7").
    ## A solution passes only if every sample is within the limit.
    fracWithinLimit <- function(fracs, limit){
      vapply(strsplit(as.character(fracs), ",", fixed = TRUE),
             function(v) all(as.numeric(v) <= limit), logical(1))
    }
    
    if (estimateScPrevalence){ ## sort but excluding solutions with too large % subclonal
      fracInd <- which(fracWithinLimit(loglik[, "Frac_CNA_subclonal"], maxFracCNASubclone) &
                       fracWithinLimit(loglik[, "Frac_genome_subclonal"], maxFracGenomeSubclone))
      if (length(fracInd) > 0){ ## if there is a solution satisfying % subclonal
        ind <- fracInd[order(as.numeric(loglik[fracInd, "loglik"]), decreasing=TRUE)]
      }else{ # otherwise just take largest likelihood
        ind <- order(as.numeric(loglik[, "loglik"]), decreasing=TRUE)
      }
    }else{#sort by likelihood only
      ind <- order(as.numeric(loglik[, "loglik"]), decreasing=TRUE)
    }
    
    #new loop by order of solutions (ind)
    ## All solutions go into one multi-page PDF, best first: the device is
    ## opened on the first page and closed on the last.
    ## NOTE(review): `id` and `s` are the values left over from the last
    ## iteration of the per-sample loop of the HMM section, i.e. the last sample.
    outPlotFile <- paste0(outDir, "/", id, "/", id, "_genomeWide_all_sols")
    for(i in seq_along(ind)) {
      hmmResults.cor <- results[[ind[i]]]
      turnDevOn <- (i == 1)
      turnDevOff <- (i == length(ind))
      plotGWSolution(hmmResults.cor, s=s, outPlotFile=outPlotFile, plotFileType="pdf",
                         logR.column = "logR", call.column = "Corrected_Call",
                         plotYLim=plotYLim, estimateScPrevalence=estimateScPrevalence,
                         seqinfo = seqinfo,
                         turnDevOn = turnDevOn, turnDevOff = turnDevOff, main=mainName[ind[i]])
    }
    
    ## Write out the best-ranked solution, annotated with the summary table of
    ## all solutions, the gender estimates and the supplied coverage.
    hmmResults.cor <- results[[ind[1]]]
    hmmResults.cor$results$loglik <- as.data.frame(loglik)
    hmmResults.cor$results$gender <- gender$gender
    hmmResults.cor$results$chrYCov <- gender$chrYCovRatio
    hmmResults.cor$results$chrXMedian <- gender$chrXMedian
    hmmResults.cor$results$coverage <- coverage
    
    outputHMM(cna = hmmResults.cor$cna, segs = hmmResults.cor$results$segs,
                          results = hmmResults.cor$results, patientID = patientID, outDir=outDir)
    outFile <- paste0(outDir, "/", patientID, ".params.txt")
    outputParametersToFile(hmmResults.cor, file = outFile)
    
        

Hold and test

Ideas
Filter fragments by length
  • Snakemake
    # Filter fragments by length.
    # The frag_distro wildcard has the form <min>_<max>; both bounds are split
    # out in the shell and passed to the filter script with the input bam, the
    # two intermediate files and the final bam.
    rule cna_frag_filt_tmp:
        benchmark: benchdir + "/{library}_{frag_distro}_cfdna_wgs_frag_filt.benchmark.txt",
        container: cfdna_wgs_container,
        input: cfdna_wgs_cna_in_bams + "/{library}.bam",
        log: logdir + "/{library}_{frag_distro}_cfdna_wgs_frag_filt.log",
        output:
            # temp() must wrap the complete path: concatenating a plain string
            # onto a temp()-flagged directory yields an unflagged string, so
            # the intermediates would never be cleaned up.
            nohead = temp(cfdna_wgs_cna_frag_bams + "/{library}_frag{frag_distro}.nohead"),
            onlyhead = temp(cfdna_wgs_cna_frag_bams + "/{library}_frag{frag_distro}.only"),
            final = cfdna_wgs_cna_frag_bams + "/{library}_frag{frag_distro}.bam",
        params:
            script = cfdna_wgs_scriptdir + "/frag_filt.sh",
            # NOTE(review): the shell command reads config[threads] rather than
            # this param; confirm which one is intended.
            threads = cfdna_wgs_threads,
        shell:
            """
            frag_min=$(echo {wildcards.frag_distro} | sed -e "s/_.*$//g")
            frag_max=$(echo {wildcards.frag_distro} | sed -e "s/^.*_//g")
            {params.script} \
            {input} \
            {output.nohead} \
            $frag_min \
            $frag_max \
            {config[threads]} \
            {output.onlyhead} \
            {output.final} > {log} 2>&1
            """
        

Reference

Ideas

Multiple frag distros

note- this would require re-write of ds_cond_target_list

Testing inputs update

  • Make a smaller fasta for indexing
    #!/bin/echo For documentation, not intended to be executable:.
    # Build a small chr8-only reference for test indexing. All paths go through
    # ${repo} so the commands do not depend on the current working directory.
    singularity shell ~/sing_containers/biotools.1.0.2.sif
    repo=/home/jeszyman/repos/cfdna-wgs
    wget --directory-prefix="${repo}/test/inputs/" https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
    
    # Keep every line matching "chr8" plus up to 1,000,000 lines after it.
    zcat "${repo}/test/inputs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz" | grep -A 1000000 chr8 | gzip > "${repo}/test/inputs/chr8.fa.gz"
    
    # Test indexed size
    mkdir -p /tmp/testbwa
    bwa index -p /tmp/testbwa/chr8 "${repo}/test/inputs/chr8.fa.gz"
    
    # The full-genome download is no longer needed
    rm "${repo}/test/inputs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz"
    
        
  • Make chr8-specific ichor references
    singularity shell ~/sing_containers/cfdna_wgs.1.0.0.sif
    
    # Subset the ichorCNA 1 Mb hg38 reference wigs to chr8 by converting
    # wig -> bigWig and extracting chr8 back to wig.
    # (The GC conversion was listed twice; it only needs to run once.)
    
    # GC content
    ~/wigToBigWig -clip /opt/ichorCNA/inst/extdata/gc_hg38_1000kb.wig test/inputs/hg38.chrom.sizes test/inputs/gc_chr8_1000kb.bw
    
    bigWigToWig -chrom=chr8 test/inputs/gc_chr8_1000kb.bw test/inputs/gc_chr8_1000kb.wig
    
    # Mappability
    ~/wigToBigWig -clip /opt/ichorCNA/inst/extdata/map_hg38_1000kb.wig test/inputs/hg38.chrom.sizes test/inputs/map_chr8_1000kb.bw
    
    bigWigToWig -chrom=chr8 test/inputs/map_chr8_1000kb.bw test/inputs/map_chr8_1000kb.wig
    
        
  • ?
    # Download the hg38 chromosome sizes table into the test inputs directory.
    wget --directory-prefix="/home/jeszyman/repos/cfdna-wgs/test/inputs" https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.chrom.sizes
    
    # Download the hg38 blacklist (v2) bed file.
    wget --directory-prefix="/home/jeszyman/repos/cfdna-wgs/test/inputs" https://raw.githubusercontent.com/Boyle-Lab/Blacklist/master/lists/hg38-blacklist.v2.bed.gz
    
    # Write an uncompressed copy next to the download (gunzip -c keeps the .gz).
    gunzip -c ~/repos/cfdna-wgs/test/inputs/hg38-blacklist.v2.bed.gz > ~/repos/cfdna-wgs/test/inputs/hg38-blacklist.v2.bed
    
        
    # Subsample real bams into small test inputs and index them.
    singularity shell --bind /mnt ~/sing_containers/cfdna_wgs.1.0.0.sif
    
    # Clear bam directory if present
    # NOTE(review): this resets test/bam, but the bams below are written to
    # test/inputs; confirm which directory is intended.
    if [ -r test/bam ]; then \rm -rf test/bam; fi
    mkdir -p test/bam
    
    # Create small bam files to store in repo. Subsample real bams to ~100 Mb.
    # sambamba -s is the subsampling fraction (0.5% or 1% of reads here).
    sambamba view -s .005 -f bam -t 36 /mnt/ris/aadel/Active/mpnst/test/bam/new_HiSeq15_L002001_ACAC_extract_ds20.bam > test/inputs/lib003_hg38.bam
    sambamba view -s .005 -f bam -t 36 /mnt/ris/aadel/Active/mpnst/test/bam/new_HiSeq15_L002001_ATCG_extract_ds20.bam > test/inputs/lib004_hg38.bam
    sambamba view -s 0.01 -f bam -t 4 /mnt/ris/aadel/Active/mpnst/bam/cfdna_wgs/ds/lib105_ds10.bam > test/inputs/lib005.bam
    sambamba view -s 0.01 -f bam -t 4 /mnt/ris/aadel/Active/mpnst/bam/cfdna_wgs/ds/lib205_ds10.bam > test/inputs/lib006.bam
    
    # Index every bam in test/inputs.
    for file in test/inputs/*.bam; do samtools index $file; done
    
        
    # Copy the fragment-filtered bams from the cfdna-frag repo into this
    # repo's test analysis directory.
    mkdir -p ~/repos/cfdna-wgs/test/analysis/cfdna_frag_bams
    cp ~/repos/cfdna-frag/test/bam/frag/*.bam ~/repos/cfdna-wgs/test/analysis/cfdna_frag_bams/
        
    # Build a 5 Mb-binned GC table for hg38 and store it as a test input.
    singularity shell --bind /mnt ~/sing_containers/cfdna_wgs.1.0.0.sif
    
    # Get hg38 gc bigwig
    wget --directory-prefix /tmp/ http://hgdownload.cse.ucsc.edu/gbdb/hg38/bbi/gc5BaseBw/gc5Base.bw
    
    # convert hg38 gc bigwig to tsv binned at 5 Mb (like Mathios, 2021)
    multiBigwigSummary bins \
        --binSize 5000000 \
        --bwfiles /tmp/gc5Base.bw \
        --numberOfProcessors 4 \
        --outFileName /tmp/test.out \
        --outRawCounts /tmp/gc5mb.tsv
    
    # Drop the first (header) line of the raw-counts table.
    tail -n +2 /tmp/gc5mb.tsv > test/inputs/gc5mb.bed
    
    
    
    
        
    • bedtools subtract -a "test/inputs/chr8.bed" -b "test/inputs/hg38-blacklist.v2.bed" > "test/inputs/keep.bed"

Make example fastq more efficient-

  • Find reads that map to early 8 and work back into a fastq

filter CNA aberrant regions

  • cite:dehner2021

Check / explain - Prerequisites for local integration testing

update /simplify aggregate qc table

expand seq depth metrics

  • using mosdepth
    #########1#########2#########3#########4#########5#########6#########7#########8
    #
    ### mosdepth for WGS depth calc  ###
    #
    # Setup
    ##
    
    # Mosdepth per bam dir
    ##
    ## For deduped bams
    for file in "$localdata"/bams/*.dedup.sorted.bam; do
        mosdepth_mpnst "$file" "$localdata/bam-qc/dedup" 250000000
    done
    ##
    #
    # get simple tsv and send to repo
    # The table is rebuilt from scratch on every run: the header is written
    # first (truncating any previous file), then one row per library and region
    # is appended. Rows are tab-separated so they match the header.
    coverage_tsv="$localdata/bam-qc/dedup/all_dedup_coverage"
    header=library_id\\tchr\\tmean_coverage
    # %b expands the \t escapes stored in $header
    printf '%b\n' "$header" > "$coverage_tsv"
    
    for file in "$localdata"/bam-qc/dedup/lib*.regions.bed.gz; do
        base=$(basename -s .dedup.sorted.regions.bed.gz "$file")
        # keep the first 24 region rows: library id, column 1 and column 4
        zcat "$file" | awk -v FS='\t' -v OFS='\t' -v var="$base" 'NR <=24 {print var,$1,$4}' >> "$coverage_tsv"
    done
    
    ## Local
    # (Removed a leftover merge-conflict marker, ">>>>>>> 2d6bf2d...", which
    # bash rejects as a syntax error.)
    # Source the project's local setup script, then run the docker_interactive
    # and biotools commands (presumably defined by that script - confirm).
    source ~/repos/mpnst/bin/local-setup.sh
    docker_interactive
    biotools
    ##
    ## Functions
    ###
    ### Convert bams to wigs
    bam_to_wig() {
        # Convert one BAM into a read-count WIG with HMMcopy's readCounter
        # (1 Mb windows, minimum mapping quality 20, chr1-22 plus X and Y).
        #
        # Arguments:
        #   $1  bam_file    path to the input BAM
        #   $2  bam_suffix  suffix stripped from the BAM name to form the WIG name
        #   $3  outdir      directory that receives <name>.wig
        #
        # The WIG is (re)built only when it is missing or older than the BAM.
        # Output is written to a temporary file first so that a failed
        # readCounter run cannot leave a partial WIG that later looks up to date.
        # Returns 1 if readCounter fails.
        local wig
        printf "Variables are: 1=bam_file 2=bam_suffix 3=outdir\n"
        # `base` is deliberately left global, as in the original definition.
        base=$(basename -s "${2}" "$1")
        wig="$3/${base}.wig"
        if [ "$wig" -ot "$1" ]; then
            if /opt/hmmcopy_utils/bin/readCounter --window 1000000 --quality 20 \
                                                  --chromosome "chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chrX,chrY" "$1" > "${wig}.tmp"; then
                mv "${wig}.tmp" "$wig"
            else
                rm -f "${wig}.tmp"
                return 1
            fi
        fi
    }
    ###
    ### Run ichor for low TF
    ichor_lowfract() {
        # Run ichorCNA on a single WIG with the settings this notebook uses
        # for low tumor fraction samples.
        #   $1  input .wig file
        #   $2  output directory (also where <id>.RData is looked for)
        # NOTE(review): the GC reference is the hg19 1000kb wig shipped with
        # ichorCNA - confirm this matches the genome build of the input BAMs.
        base=$(basename -s .wig $1)

        # Skip the run unless the RData result is missing or older than the WIG.
        [ $2/$base.RData -ot $1 ] || return 0

        Rscript /opt/ichorCNA/scripts/runIchorCNA.R \
                --id $base --WIG $1 \
                --gcWig /opt/ichorCNA/inst/extdata/gc_hg19_1000kb.wig \
                --normal "c(0.95, 0.99, 0.995, 0.999)" \
                --ploidy "c(2)" --maxCN 3 \
                --estimateScPrevalence FALSE --scStates "c()" \
                --outDir $2
    }
    ##
    ##
    # Driver section: expects $localdata to be set and the bam_to_wig /
    # ichor_lowfract functions to be defined in the current shell.
    mkdir -p "$localdata/wigs"
    mkdir -p "$localdata/ichor"
    #
    # Make wigs
    #
    #bam_to_wig /mnt/xt3/mpnst/frag-filt-bams/lib109.dedup.sorted.frag90_150.sorted.bam .dedup.sorted.frag90_150.sorted.bam $localdata/wigs
    ##
    ## lib109 first; the loop over all fragment-filtered BAMs below covers it
    ## again, but bam_to_wig skips WIGs that are already up to date.
    for file in "$localdata"/frag-filt-bams/lib109*.bam; do
        bam_to_wig "$file" \
                   .dedup.sorted.frag.sorted.bam \
                   "$localdata/wigs"
    done
    
    ## For fraction-filtered WGS cfDNA
    for file in "$localdata"/frag-filt-bams/*.bam; do
        bam_to_wig "$file" \
                   .dedup.sorted.frag.sorted.bam \
                   "$localdata/wigs"
    done
    ##
    ## For tumor and leukocyte WGS libraries
    ### Make array of genomic library file paths
    ### (first CSV column of tumor/leukocyte rows that do not mention "wes",
    ### with quotes stripped, wrapped as /mnt/xt3/mpnst/bams/<id>.dedup.sorted.bam)
    genomic=($(grep -e tumor -e leukocyte /drive3/users/jszymanski/repos/mpnst/data/libraries.csv | grep -v "wes" | awk -F, '{print $1}' | sed 's/"//g' | sed 's/$/.dedup.sorted.bam/g' | sed 's/^/\/mnt\/xt3\/mpnst\/bams\//g'))
    ###
    for file in "${genomic[@]}"; do
        bam_to_wig "$file" \
                   .dedup.sorted.bam \
                   "$localdata/wigs"
    done
    #
    ##
    ## Send successful file list to repo
    ## (-f: do not complain when the list does not exist yet, e.g. on first run)
    rm -f /drive3/users/jszymanski/repos/mpnst/data/wigs.tsv
    for file in "$localdata"/wigs/*.wig;
    do
        base=$(basename -s .wig "$file")
        echo "$base" >> /drive3/users/jszymanski/repos/mpnst/data/wigs.tsv
    done
    #
    ##RESUME HERE
    # ichor
    ##
    for file in "$localdata"/wigs/lib109*.wig; do
        ichor_lowfract "$file" "$localdata/ichor"
    done
    
    
    # Publish the aggregated coverage table to the repo unless it is too large.
    # Expects $localdata and $repo to be set by the caller.
    coverage_table="$localdata/bam-qc/dedup/all_dedup_coverage"
    
    # Prepend the column header only when the first line is not already the
    # header, so running this after an earlier header insert does not add it twice.
    header=library_id\\tchr\\tmean_coverage
    if [ "$(head -n 1 "$coverage_table")" != "$(printf '%b' "$header")" ]; then
        sed -i "1 i$header" "$coverage_table"
    fi
    
    max_file_size=5000000
    file_size=$(
        wc -c <"$coverage_table"
             )
    
    # The original tested the unset variable $filesize, which made the test
    # error out and always fall through to the copy branch.
    # NOTE(review): the marker is written under $repo/data/qc but the table is
    # copied to $repo/qc - confirm which directory is intended.
    if [ "$file_size" -gt "$max_file_size" ]; then
        touch "$repo/data/qc/all_dedup_coverage_too_big"
    else
        cp "$coverage_table" "$repo/qc/all_dedup_coverage.tsv"
    fi
    #
        

make consolidated per-cna file (see $datadir/old/all_cna.bed)

Bam downsampling: multiple downsampling sizes

Downsample bams

  • Snakemake
    # Downsample bam file to a set number of reads
    #
    # Wildcards: {library} selects the input BAM, {milreads} is the target read
    # count in millions (it is passed to downsample.sh, which multiplies it by
    # 1,000,000).
    # Input:  <cfdna_wgs_bams>/{library}_filt.bam
    # Output: <cfdna_wgs_bams>/{library}_ds{milreads}.bam
    # The script's stdout and stderr both go to the rule's log file.
    rule cfdna_wgs_downsample:
        benchmark: benchdir + "/{library}_{milreads}_cfdna_wgs_downsample.benchmark.txt",
        container: cfdna_wgs_container,
        input: cfdna_wgs_bams + "/{library}_filt.bam",
        log: logdir + "/{library}_{milreads}_cfdna_wgs_downsample.log",
        output: cfdna_wgs_bams + "/{library}_ds{milreads}.bam",
        params:
            # NOTE(review): params.milreads and params.threads are not referenced
            # by the shell command below; the wildcard value is what gets passed.
            milreads = MILREADS,
            script = cfdna_wgs_scriptdir + "/downsample.sh",
            threads = cfdna_wgs_threads,
        # Positional arguments to the script: in_bam, milreads, out_bam.
        shell:
            """
            {params.script} \
            {input} \
            {wildcards.milreads} \
            {output} &> {log}
            """
        
  • Shell script
    #!/usr/bin/env bash
    
    # Downsample a BAM to approximately a target number of reads.
    #
    # Usage: downsample.sh <in_bam> <milreads> <out_bam>
    #   in_bam    indexed input BAM (samtools idxstats is used to count reads)
    #   milreads  target read count, in millions (may be fractional)
    #   out_bam   path of the downsampled BAM to write
    #
    # If the target exceeds the number of mapped reads, a message is printed
    # and no output file is written. Exits 1 if the BAM has no mapped reads.
    
    # For unit testing
    # in_bam="test/analysis/cfdna_wgs_bams/lib001_filt.bam"
    # out_bam=/tmp/test.bam
    # milreads="0.0041"
    
    in_bam=$1
    milreads="$2"
    out_bam=$3
    
    reads=$(echo |awk -v var1="$milreads" '{ print 1000000*var1 }')
    
    ## Calculate the sampling factor based on the intended number of reads:
    
    # Total mapped reads = sum of column 3 of idxstats. Printed with %.0f so
    # large counts are never rendered in exponent notation.
    total=$(samtools idxstats "$in_bam" | cut -f3 | awk 'BEGIN {total=0} {total += $1} END {printf "%.0f", total}')
    
    if ! awk -v t="$total" 'BEGIN { exit !(t > 0) }'; then
        echo "No mapped reads found in $in_bam" >&2
        exit 1
    fi
    
    # Fixed-point formatting: awk's default print would emit small ratios as
    # e.g. "1e-05".
    FACTOR=$(awk -v COUNT="$reads" -v total="$total" 'BEGIN { printf "%.10f", COUNT/total }')
    
    # Compare numerically. The previous [[ $FACTOR > 1 ]] was a string
    # comparison, under which "1e-05" sorts after "1" and downsampling was
    # wrongly skipped.
    if awk -v f="$FACTOR" 'BEGIN { exit !(f > 1) }'; then
        echo "DS reads exceeds total for $in_bam"
    else
        sambamba view -s "$FACTOR" -f bam -l 5 "$in_bam" > "$out_bam"
    fi
    
        

Reference

Bioinformatics project module

Biotools headline

Old rules

Alignment processing

# Alignment deduplication and sorting
#
# Runs alignment_processing.sh on {library_id}_raw.bam. The script receives, in
# order: input, thread count, converted BAM, deduplicated BAM, sorted BAM, and
# index path. Stdout and stderr go to the rule's log file.
#
# The shell command referenced {output.bam}, but no output named `bam` was
# declared, so the command could not be formatted. A temporary `bam` output is
# declared here for the script's third argument.
# NOTE(review): the "_converted.bam" file name is a placeholder chosen for this
# fix - confirm the intended intermediate name.
rule alignment_processing:
    input:
        config["datadir"] + "/bam/{library_id}_raw.bam",
    output:
        bam = temp(config["datadir"] + "/bam/{library_id}_converted.bam"),
        dedup = temp(config["datadir"] + "/bam/{library_id}_dedup_unsort.bam"),
        sort = config["datadir"] + "/bam/{library_id}_dedup.bam",
        index = config["datadir"] + "/bam/{library_id}_dedup.bam.bai",
    log:
        config["datadir"] + "/logs/alignment_processing_{library_id}.log"
    shell:
        """
        {config[cfdna_wgs_script_dir]}/alignment_processing.sh \
        {input} \
        {config[threads]} \
        {output.bam} \
        {output.dedup} \
        {output.sort} \
        {output.index} \
        &> {log}
        """
  • Script
    #!/usr/bin/env bash
    
    <#bash_preamble#>
    
    input=$1
    threads=$2
    output_bam=$3
    output_dedup=$4
    output_sort=$5
    output_index=$6
    
    sambamba view -t $threads -S -f bam $input > $output_bam
    sambamba markdup -r -t $threads $output_bam $output_dedup
    sambamba sort -t $threads $output_dedup -o $output_sort
    sambamba index -t $threads $output_sort