#!/bin/bash
set -o pipefail
set -e
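#
# Usage sketch (assumptions: queue and job definition names below are
# placeholders; the environment variables listed are the ones this script
# actually reads):
#   ACCESSION_LIST  s3:// URL of a text file with one SRA accession per line
#   NUM_CORES       threads for bowtie2
#   BUCKET_NAME     S3 bucket holding fastq input and SAM output
#   PREFIX          S3 key prefix for SAM output
#   REFERENCES      comma-separated bowtie2 index names under /bt2/
#
#   aws batch submit-job \
#     --job-name sra-align \
#     --job-queue <your-queue> \
#     --job-definition <your-job-definition> \
#     --array-properties size=100 \
#     --container-overrides 'environment=[{name=NUM_CORES,value=8}]'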
# send all output to a file as well (see closing brace at the bottom)
{
set -e # exit on error
set -o pipefail
set -x
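# query the EC2 instance metadata service for this host's public DNS name;
# "|| true" keeps the script going when running outside EC2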
public_hostname=$(curl -s http://169.254.169.254/latest/meta-data/public-hostname || true)
echo "public hostname for this container is $public_hostname"
container_id=$(head -n 1 /proc/self/cgroup | cut -d '/' -f4)
echo "container id is $container_id"
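# fetch the accession list (one SRA accession per line) from S3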
aws s3 cp "$ACCESSION_LIST" accessionlist.txt
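# tune the AWS CLI for large, highly parallel S3 multipart transfers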
aws configure set default.s3.multipart_chunksize 50MB
aws configure set default.s3.max_concurrent_requests 100
aws configure set default.s3.max_queue_size 10000
aws configure set default.s3.multipart_threshold 64MB
if [ -z "$NUM_CORES" ]; then
echo NUM_CORES is not set, exiting
exit 1
else
echo "NUM_CORES is set to $NUM_CORES"
fi
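# AWS Batch sets AWS_BATCH_JOB_ID; array jobs additionally set
# AWS_BATCH_JOB_ARRAY_INDEX (0-based), which selects a line from the
# accession list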
if [[ -v AWS_BATCH_JOB_ID ]]
then
echo this is an aws batch job
rm -rf ~/ncbi
if [[ -v AWS_BATCH_JOB_ARRAY_INDEX ]]
then
echo this is an array job
line="$((AWS_BATCH_JOB_ARRAY_INDEX + 1))"
SRA_ACCESSION=$(sed "${line}q;d" accessionlist.txt)
scratch=/scratch/$AWS_BATCH_JOB_ID/$AWS_BATCH_JOB_ARRAY_INDEX/
else
echo this is not an array job
SRA_ACCESSION=$(sed '1q;d' accessionlist.txt)
scratch=/scratch/$AWS_BATCH_JOB_ID/
fi
mkdir -p "$scratch"
ln -s "$scratch" ~/ncbi
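# ~/ncbi now points into job-scoped scratch, so the dbGaP workspace below
# lands on the scratch volume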
mkdir -p ~/ncbi/dbGaP-17102
else
echo this is not an aws batch job
SRA_ACCESSION=$(sed '1q;d' accessionlist.txt)
scratch=.
mkdir -p ~/ncbi/dbGaP-17102
fi
cd ~/ncbi/dbGaP-17102
PTMP=tmp
mkdir -p "$PTMP"
rm -rf "$PTMP"/*
echo "SRA_ACCESSION is $SRA_ACCESSION"
echo "scratch is $scratch"
# echo get size of $SRA_ACCESSION ...
# prefetch -s $SRA_ACCESSION
# interval=$(RANDOM=$$ shuf -i 0-60 -n 1)
# echo sleeping $interval minutes before download to avoid slamming SRA....
# sleep ${interval}m
# echo downloading $SRA_ACCESSION from sra...
# if [ -f ~/ncbi/dbGaP-17102/sra/$SRA_ACCESSION.sra ]; then
# echo SRA file already exists, skipping download
# else
# if prefetch --transport http --max-size 100000000000 $SRA_ACCESSION ; then
# echo finished downloading, prefetch exited with result code 0
# else
# result=$?
# echo prefetch exited with nonzero result code $result, cleaning up and exiting...
# rm -f ~/ncbi/dbGaP-17102/sra/$SRA_ACCESSION.sra
# rm -f ~/ncbi/public/sra/* ~/ncbi/public/refseq/*
# exit $result
# fi
# fi
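# the prefetch/fastq-dump path above is retained for reference but disabled;
# fastqs are pulled directly from S3 below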
fastq_url=s3://$BUCKET_NAME/pipeline-fastq-salivary/$SRA_ACCESSION/$SRA_ACCESSION.fastq.gz
# echo streaming fastq-dump output to s3...
#
# time (fastq-dump -Z ~/ncbi/dbGaP-17102/sra/$SRA_ACCESSION.sra | pv -i 59 -N fastq-dump |gzip| pv -i 59 -N gzip | aws s3 cp - $fastq_url)
# ( downloads to ~/ncbi/public/sra/)
# echo running fastq-dump
# time parallel-fastq-dump --sra-id sra/$SRA_ACCESSION.sra --threads $NUM_CORES --outdir . --gzip --split-files -W -I --tmpdir $PTMP
# echo "done with fastq-dump, copying fastqs to s3"
# aws s3 cp ${SRA_ACCESSION}_1.fastq.gz s3://$BUCKET_NAME/pipeline-fastq-salivary/$SRA_ACCESSION/
# aws s3 cp ${SRA_ACCESSION}_2.fastq.gz s3://$BUCKET_NAME/pipeline-fastq-salivary/$SRA_ACCESSION/
# viruses=( hhv6a hhv6b hhv-7 gapdhpolyAtrimmed )
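# REFERENCES is a comma-separated list of bowtie2 index names (see the
# examples above); split it one-per-line for the loop below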
viruses=$(echo "$REFERENCES" | tr ',' '\n')
# viruses=( hhv6a_u1102_untrimmed hhv6b_z29_untrimmed hhv-7 gapdhpolyAtrimmed )
echo starting pipeline...
echo getting fastqs from s3...
# aws s3 cp s3://$BUCKET_NAME/pipeline-fastq/$SRA_ACCESSION/${SRA_ACCESSION}_1.fastq.gz .
# aws s3 cp s3://$BUCKET_NAME/pipeline-fastq/$SRA_ACCESSION/${SRA_ACCESSION}_2.fastq.gz .
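# the fastqs may live under either prefix, so try both and tolerate
# individual failures; set -e is restored afterwards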
set +e
aws s3 cp "s3://$BUCKET_NAME/pipeline-fastq/$SRA_ACCESSION/${SRA_ACCESSION}_1.fastq.gz" .
aws s3 cp "s3://$BUCKET_NAME/pipeline-fastq/$SRA_ACCESSION/${SRA_ACCESSION}_2.fastq.gz" .
aws s3 cp "s3://$BUCKET_NAME/pipeline-fastq-salivary/$SRA_ACCESSION/${SRA_ACCESSION}_1.fastq.gz" .
aws s3 cp "s3://$BUCKET_NAME/pipeline-fastq-salivary/$SRA_ACCESSION/${SRA_ACCESSION}_2.fastq.gz" .
set -e
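# a minimal guard (not in the original script): fail fast if neither prefix
# yielded the paired fastqs, rather than letting bowtie2 fail later
if [[ ! -f ${SRA_ACCESSION}_1.fastq.gz || ! -f ${SRA_ACCESSION}_2.fastq.gz ]]; then
echo "could not find paired fastqs for $SRA_ACCESSION in S3, exiting"
exit 1
fi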
# for virus in "${viruses[@]}"; do
for virus in $viruses; do
# virus="betaglobincds"
echo "processing $virus ..."
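# idempotency check: head-object exits nonzero when the key is absent, so
# existing outputs are skipped on retries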
if aws s3api head-object --bucket "$BUCKET_NAME" --key "$PREFIX/$SRA_ACCESSION/$virus/$SRA_ACCESSION.sam" &> /dev/null; then
echo output file already exists in S3, skipping...
else
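# stream bowtie2's SAM output through pv (progress every 31 s) straight into
# a multipart S3 upload, so the alignment never touches local disk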
time bowtie2 --local -p "$NUM_CORES" --no-unal -1 "${SRA_ACCESSION}_1.fastq.gz" -2 "${SRA_ACCESSION}_2.fastq.gz" -x "/bt2/$virus" | \
pv -i 31 -f -N "bowtie2 $virus" | \
aws s3 cp - "s3://$BUCKET_NAME/$PREFIX/$SRA_ACCESSION/$virus/$SRA_ACCESSION.sam"
fi
done
echo done with pipeline, cleaning up
# echo removing fastq file from s3...
# aws s3 rm $fastq_url
echo removing scratch...
if [[ -v AWS_BATCH_JOB_ID ]]
then
rm -rf "$scratch"
fi
echo exiting...
} 2>&1 | tee /tmp/batch.log