forked from bmvdgeijn/WASP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
example_cht_workflow.sh
180 lines (152 loc) · 6.59 KB
/
example_cht_workflow.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#
# Convert SNP files to HDF5 format. This program can be used
# on output from impute2 or on VCF files
#
# --chrom supplies chromosome names/lengths; the trailing globs list the
# per-chromosome impute2 genotype and haplotype inputs. The four .h5
# paths are outputs: the same snp_index.h5 / snp_tab.h5 / haps.h5 /
# geno_probs.h5 files are read by the bam2h5 and CHT steps below.
./snp2h5/snp2h5 --chrom example_data/chromInfo.hg19.txt \
--format impute \
--geno_prob example_data/geno_probs.h5 \
--snp_index example_data/snp_index.h5 \
--snp_tab example_data/snp_tab.h5 \
--haplotype example_data/haps.h5 \
example_data/genotypes/chr*.hg19.impute2.gz \
example_data/genotypes/chr*.hg19.impute2_haps.gz
#
# Convert FASTA files to HDF5 format.
# Note the HDF5 sequence files are only used for GC content
# correction part of CHT. This step can be omitted if
# GC-content correction is not used.
#
# seq.h5 produced here is consumed by CHT/update_total_depth.py below.
# NOTE(review): the reference-genome path is machine-specific — adjust
# /data/external_public/... to your local hg19 FASTA location.
./snp2h5/fasta2h5 --chrom example_data/chromInfo.hg19.txt \
--seq example_data/seq.h5 \
/data/external_public/reference_genomes/hg19/chr*.fa.gz
# Loop over all individuals in the H3K27ac samples file.
# These two variables are also used by the later steps of the workflow.
H3K27AC_SAMPLES_FILE=example_data/H3K27ac/samples.txt
ALL_SAMPLES_FILE=example_data/genotypes/YRI_samples.txt
for INDIVIDUAL in $(cat "$H3K27AC_SAMPLES_FILE")
do
    echo "$INDIVIDUAL"
    #
    # Read BAM files for this individual and write read counts to
    # HDF5 files (per-SNP ref/alt/other allele-specific counts plus
    # overall read counts). Expansions are quoted to be safe against
    # word-splitting/globbing (ShellCheck SC2086); the chr* glob on the
    # BAM argument is deliberately left unquoted so it still expands.
    #
    python CHT/bam2h5.py --chrom example_data/chromInfo.hg19.txt \
        --snp_index example_data/snp_index.h5 \
        --snp_tab example_data/snp_tab.h5 \
        --haplotype example_data/haps.h5 \
        --samples "$ALL_SAMPLES_FILE" \
        --individual "$INDIVIDUAL" \
        --ref_as_counts example_data/H3K27ac/ref_as_counts."$INDIVIDUAL".h5 \
        --alt_as_counts example_data/H3K27ac/alt_as_counts."$INDIVIDUAL".h5 \
        --other_as_counts example_data/H3K27ac/other_as_counts."$INDIVIDUAL".h5 \
        --read_counts example_data/H3K27ac/read_counts."$INDIVIDUAL".h5 \
        example_data/H3K27ac/"$INDIVIDUAL".chr*.keep.rmdup.bam
done
#
# Make a list of target regions in ChIP-seq peaks and associated SNPs
# to test with the CHT (written to chr22.peaks.txt.gz). The provided
# get_target_regions.py script can be used to identify target regions
# and test SNPs that match specific criteria, (e.g. minimum number of
# heterozygous individuals and total number of allele-specific reads
# in target region). The file containing target regions and test SNPs can
# also be generated by the user (for example if a specific set of
# target regions and test SNPs are to be tested).
#
# FIX: added the missing space before the line-continuation backslash
# after "--min_minor_allele_count 1" — without it the argument only
# parsed correctly by accident of the next line's leading whitespace.
python CHT/get_target_regions.py \
    --target_region_size 2000 \
    --min_as_count 10 \
    --min_read_count 100 \
    --min_het_count 1 \
    --min_minor_allele_count 1 \
    --chrom example_data/chromInfo.hg19.txt \
    --read_count_dir example_data/H3K27ac \
    --individuals "$H3K27AC_SAMPLES_FILE" \
    --samples "$ALL_SAMPLES_FILE" \
    --snp_tab example_data/snp_tab.h5 \
    --snp_index example_data/snp_index.h5 \
    --haplotype example_data/haps.h5 \
    --output_file example_data/H3K27ac/chr22.peaks.txt.gz
# FIX: the original 'for' loop was missing its 'do' keyword, which is a
# bash syntax error — the script could not run past this point at all.
for INDIVIDUAL in $(cat "$H3K27AC_SAMPLES_FILE")
do
    #
    # create CHT input file for this individual
    #
    python CHT/extract_haplotype_read_counts.py \
        --chrom example_data/chromInfo.hg19.txt \
        --snp_index example_data/snp_index.h5 \
        --snp_tab example_data/snp_tab.h5 \
        --geno_prob example_data/geno_probs.h5 \
        --haplotype example_data/haps.h5 \
        --samples "$ALL_SAMPLES_FILE" \
        --individual "$INDIVIDUAL" \
        --ref_as_counts example_data/H3K27ac/ref_as_counts."$INDIVIDUAL".h5 \
        --alt_as_counts example_data/H3K27ac/alt_as_counts."$INDIVIDUAL".h5 \
        --other_as_counts example_data/H3K27ac/other_as_counts."$INDIVIDUAL".h5 \
        --read_counts example_data/H3K27ac/read_counts."$INDIVIDUAL".h5 \
        example_data/H3K27ac/chr22.peaks.txt.gz \
        | gzip > example_data/H3K27ac/haplotype_read_counts."$INDIVIDUAL".txt.gz
done
#
# Adjust read counts in CHT files by modeling
# relationship between read depth and GC content & peakiness
# in each sample.
# (first make files containing lists of input and output files)
#
IN_FILE=example_data/H3K27ac/input_files.txt
OUT_FILE=example_data/H3K27ac/output_files.txt
ls example_data/H3K27ac/haplotype_read_counts* | grep -v adjusted > "$IN_FILE"
# Derive each output name by inserting ".adjusted" before ".txt".
# (dot escaped so sed matches a literal '.'; reads $IN_FILE directly
# instead of the original useless 'cat | sed')
sed 's/\.txt/.adjusted.txt/' "$IN_FILE" > "$OUT_FILE"
python CHT/update_total_depth.py --seq example_data/seq.h5 "$IN_FILE" "$OUT_FILE"
#
# Adjust heterozygote probabilities in CHT files to account for
# possible genotyping errors. Total counts of reference and
# alternative alleles are used to adjust the probability. In
# this example we just provide the same H3K27ac read counts, however
# you could also use read counts combined across many different
# experiments or (perhaps ideally) from DNA sequencing.
#
for INDIVIDUAL in $(cat "$H3K27AC_SAMPLES_FILE")
do
    # Input: depth-adjusted counts from the previous step; output adds
    # the updated het probabilities (.adjusted.hetp.txt.gz).
    IN_FILE=example_data/H3K27ac/haplotype_read_counts."$INDIVIDUAL".adjusted.txt.gz
    OUT_FILE=example_data/H3K27ac/haplotype_read_counts."$INDIVIDUAL".adjusted.hetp.txt.gz
    python CHT/update_het_probs.py \
        --ref_as_counts example_data/H3K27ac/ref_as_counts."$INDIVIDUAL".h5 \
        --alt_as_counts example_data/H3K27ac/alt_as_counts."$INDIVIDUAL".h5 \
        "$IN_FILE" "$OUT_FILE"
done
# Build the list of fully-processed per-individual CHT input files
# (.adjusted.hetp.txt.gz) consumed by the fitting and testing steps.
CHT_IN_FILE=example_data/H3K27ac/cht_input_file.txt
ls example_data/H3K27ac/haplotype_read_counts*.adjusted.hetp.txt.gz > $CHT_IN_FILE
#
# Estimate overdispersion parameters for allele-specific test (beta binomial)
#
# Coefficients written here are passed to combined_test.py via --as_disp.
OUT_FILE=example_data/H3K27ac/cht_as_coef.txt
python CHT/fit_as_coefficients.py $CHT_IN_FILE $OUT_FILE
#
# Estimate overdispersion parameters for association test (beta-negative binomial)
#
# --min_counts / --min_as_counts presumably set minimum total and
# allele-specific read-count thresholds for regions used in fitting —
# confirm against CHT/fit_bnb_coefficients.py. Output is passed to
# combined_test.py via --bnb_disp.
OUT_FILE=example_data/H3K27ac/cht_bnb_coef.txt
python CHT/fit_bnb_coefficients.py --min_counts 50 --min_as_counts 10 $CHT_IN_FILE $OUT_FILE
#
# run combined haplotype test
#
# Uses the beta-negative-binomial and beta-binomial overdispersion
# coefficients fitted in the two steps above; test results are written
# to cht_results.txt.
OUT_FILE=example_data/H3K27ac/cht_results.txt
python CHT/combined_test.py --min_as_counts 10 \
--bnb_disp example_data/H3K27ac/cht_bnb_coef.txt \
--as_disp example_data/H3K27ac/cht_as_coef.txt \
$CHT_IN_FILE $OUT_FILE
#
# Optionally, principal component loadings can be used as covariates
# by the CHT. An example of how to perform PCA and obtain principal
# component loadings is provided in the file example_data/H3K27ac/get_PCs.R
# Note that we only recommend using PCs as covariates when sample
# sizes are fairly large (e.g. > 30 individuals).
#
# Example of how to get PC loadings
# Rscript example_data/H3K27ac/get_PCs.R > example_data/H3K27ac/pcs.txt
#
# Using the first 2 PC loadings in the CHT:
# OUT_FILE=example_data/H3K27ac/cht_results.PCs.txt
# python CHT/combined_test.py --min_as_counts 10 \
# --bnb_disp example_data/H3K27ac/cht_bnb_coef.txt \
# --as_disp example_data/H3K27ac/cht_as_coef.txt \
# --num_pcs 2 --pc_file example_data/H3K27ac/pcs.txt \
# $CHT_IN_FILE $OUT_FILE
#