-
Notifications
You must be signed in to change notification settings - Fork 4
/
dbsnp_annotate_one.sh
executable file
·138 lines (114 loc) · 3.65 KB
/
dbsnp_annotate_one.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/bin/bash
readonly DB_PATH=${USE_DB_PATH:-"/dbs/annotation_databases"}
readonly EXECUTABLE_PATH=${USE_EXECUTABLE_PATH:-"/usr/local/bin"}
readonly CODINGPATH="${DB_PATH}/CosmicCodingMuts.vcf.gz"
readonly NONCODINGPATH="${DB_PATH}/CosmicNonCodingVariants.vcf.gz"
if [[ -f "${CODINGPATH}" ]]
then
readonly cosmic_coding="$CODINGPATH"
fi
if [[ -f "${NONCODINGPATH}" ]]
then
readonly cosmic_non_coding="$CODINGPATH"
fi
function usage {
>&2 echo "usage: $0 input.vcf.gz snv|indel output_file"
>&2 echo " annotates one merged tumor"
exit 1
}
readonly input_file="$1"
readonly variant_type="$2"
readonly output_file="$3"
if [[ -z "$input_file" ]] || [[ -z "$output_file" ]]
then
>&2 echo "argument missing"
>&2 echo "invocation: $0 \"$1\" \"$2\""
usage
fi
if [[ ! -f "$input_file" ]]
then
>&2 echo "file missing: ${input_file} not found"
usage
fi
if [[ -z "$variant_type" ]]
then
>&2 echo "Variant type missing"
usage
fi
if [[ "$variant_type" != "snv" ]] && [[ "$variant_type" != "indel" ]]
then
>&2 echo "Invalid variant type $variant_type"
>&2 echo "Valid variant types: snv indel"
usage
fi
if [[ ! -d "$DB_PATH" ]]
then
>&2 echo "database directory missing: ${DB_PATH} not found"
usage
fi
if [[ "$variant_type" == "indel" && ( -z "$cosmic_coding" || -z "$cosmic_non_coding" ) ]]
then
>&2 echo "Error: Indel consensus calling requires cosmic VCFs"
>&2 echo "Run download cosmic before merging indels"
usage
fi
if [[ -f "${output_file}.gz" ]] && [[ "${output_file}.gz" -nt "$input_file" ]]
then
>&2 echo "$0: ${output_file} exists and is newer than inputs; cowardly refusing to overwrite."
exit 1
fi
function fix_vcfanno_header {
sed \
-e 's/^##INFO=<ID=1000genomes_AF.*$/##INFO=<ID=1000genomes_AF,Number=1,Type=Float,Description="Thousand Genomes phase 3 occurance fraction if found: ALL.wgs.phase3_shapeit2_mvncall_integrated_v5b.20130502.sites.vcf.gz">/' \
-e 's/^##INFO=<ID=1000genomes_ID.*$/##INFO=<ID=1000genomes_ID,Number=1,Type=String,Description="Thousand Genomes phase 3 ID if found: ALL.wgs.phase3_shapeit2_mvncall_integrated_v5b.20130502.sites.vcf.gz">/' \
-e 's/^##INFO=<ID=cosmic,.*$/##INFO=<ID=cosmic,Number=1,Type=String,Description="(first) cosmic ID if found, COSMICv76">/' \
-e 's/^##INFO=<ID=dbsnp,.*$/##INFO=<ID=dbsnp,Number=1,Type=String,Description="(first) dbSNP ID if found, build 147, All_20160408.vcf.gz">/' \
-e 's/^##INFO=<ID=repeat_masker,.*$/##INFO=<ID=repeat_masker,Number=1,Type=String,Description="Repeat masker region if in one">/'
}
readonly DBSNP_ANNOTATIONS=/tmp/dbsnp_annotations.conf
rm -f $DBSNP_ANNOTATIONS
cat > $DBSNP_ANNOTATIONS <<EOF
[[annotation]]
file = "${DB_PATH}/All_20160601.vcf.gz"
fields = ["ID", "VP"]
names = ["dbsnp", "dbsnp_VP"]
ops = ["first", "first"]
[[annotation]]
file = "${DB_PATH}/hg19.rmsk.bed.gz"
columns = [4]
names = ["repeat_masker"]
ops = ["first"]
[[annotation]]
file = "${DB_PATH}/1000genomes.phase3.decomposed.normalized.vcf.gz"
fields = ["ID", "AF"]
names = ["1000genomes_ID", "1000genomes_AF"]
ops = ["self", "self"]
EOF
for cosmicfile in "$cosmic_coding" "$cosmic_non_coding"
do
if [[ -f "$cosmicfile" ]]
then
cat >> $DBSNP_ANNOTATIONS <<EOF
[[annotation]]
file = "${cosmicfile}"
fields = ["ID"]
names = ["cosmic"]
ops = ["first"]
EOF
fi
done
function cleancalls {
if [[ $variant_type == "snv" ]]
then
"${EXECUTABLE_PATH}/clean_snv_calls.py"
else
cat
fi
}
vcfanno $DBSNP_ANNOTATIONS "${input_file}" \
| fix_vcfanno_header \
| cleancalls \
> "${output_file}"
bgzip "${output_file}"
tabix -p vcf "${output_file}.gz"
rm $DBSNP_ANNOTATIONS