% HOCKING-datatable-foverlaps.tex
\documentclass{beamer}
\usepackage{tikz}
\usepackage[all]{xy}
\usepackage{listings}
\usepackage{slashbox}
%\usepackage{booktabs}
\usepackage{amsmath,amssymb}
\usepackage{hyperref}
\usepackage{graphicx}
% Math operators, typeset upright. The starred form of
% \DeclareMathOperator places sub/superscripts as limits (below/above
% the operator) in display style.
% NOTE(review): none of these operators are used in the frames of this
% file; presumably kept from a shared preamble or needed by
% \input{table-simple} -- confirm before removing.
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\Lik}{Lik}
\DeclareMathOperator*{\Peaks}{Peaks}
\DeclareMathOperator*{\Segments}{Segments}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\maximize}{maximize}
\DeclareMathOperator*{\minimize}{minimize}
% Sign function as an ordinary (non-limits) upright operator.
\newcommand{\sign}{\operatorname{sign}}
% Blackboard-bold shorthands: \RR, \ZZ and \NN expand to \mathbb R,
% \mathbb Z and \mathbb N respectively.
\newcommand{\RR}{\mathbb R}
\newcommand{\ZZ}{\mathbb Z}
\newcommand{\NN}{\mathbb N}
% Set transparency of non-highlighted sections in the table of
% contents slide.
\setbeamertemplate{section in toc shaded}[default][100]
\AtBeginSection[]
{
\setbeamercolor{section in toc}{fg=red}
\setbeamercolor{section in toc shaded}{fg=black}
\begin{frame}
\tableofcontents[currentsection]
\end{frame}
}
\begin{document}
\title{Overlap Death Match!}
\author{
Toby Dylan Hocking\\
toby.hocking@mail.mcgill.ca
\vskip 0.5cm
\parbox{2cm}{
\includegraphics[height=2.5cm]{bedtools}
}
VS
\parbox{2cm}{
\centering
\includegraphics[height=2cm]{bioconductor}
\\
Bioconductor
}
VS
\parbox{2cm}{
\centering
\includegraphics[height=2cm]{datatable}
\\
data.table
}
% Main reference: data.table package on CRAN.\\
% { \url{http://cran.rstudio.com/web/packages/data.table}}
% (see vignettes datatable-intro.pdf and datatable-faq.pdf).
}
\date{12 February 2015}
\maketitle
\section{The contenders}
\begin{frame}
\frametitle{Brief history of statistical computing}
\begin{description}
\item[1957] FORTRAN by John Backus (IBM).
%\item[1966] SAS (North Carolina State Univ).
\item[1972] C by Dennis Ritchie (Bell Labs).
\item[1976] S by John M Chambers (Bell Labs).
\item[1983] C++ by Bjarne Stroustrup (Bell Labs).
%\item[1988] ``New S'' much rewritten in C.
\item[1993] S exclusively licensed to StatSci/MathSoft.\\
R by Ross Ihaka and Robert Gentleman\\
(Univ Auckland, New Zealand).
\item[1998] Association for Computing Machinery Software System Award
to John M Chambers for ``the S system, which has forever altered
the way people analyze, visualize, and manipulate data.''
\end{description}
\vskip 0.5cm Source: Wikipedia, R-FAQ.
\end{frame}
\begin{frame}
\frametitle{R = interactive, graphical, programming with data}
What is R? (Source: R-FAQ)
``R is a system for statistical computation and graphics. It
consists of a language plus a run-time environment with graphics, a
debugger, access to certain system functions, and the ability to run
programs stored in script files.''
\begin{description}
\item[interactive] command line (versus compiled).
\item[graphical] publication-quality plots.
\item[programming with data] \texttt{data.frame} which represents a
tabular data set.
\end{description}
\end{frame}
\begin{frame}
\frametitle{Selected Bioconductor project history}
\begin{tabular}{rrrll}
Version & Release & Packages & Depends & Firsts/notes \\
\hline
1.0 & 1 May 2002 & 15 & R 1.5 \\
%\vdots & \vdots & \vdots & \vdots \\
2.3 & 22 Oct 2008 & 294 & R 2.8 & IRanges\\
2.5 & 28 Oct 2009 & 352 & R 2.10 & findOverlaps\\
2.6 & 23 Apr 2010 & 389 & R 2.11 & GenomicRanges\\
3.0 & 14 Oct 2014 & 934 & R 3.1 & TESTED
\end{tabular}
\includegraphics[height=2cm]{bioconductor} Bioconductor = R packages
with compiled C code.
\vskip 1cm
Source: Wikipedia ``Bioconductor,'' archived packages, e.g.,
\url{http://www.bioconductor.org/packages/2.5}
\end{frame}
\begin{frame}[fragile]
\frametitle{Selected bedtools project history}
\begin{verbatim}
VERSION 1.1, 04/23/2009. Initial release.
\end{verbatim}
BEDTools: a flexible suite of utilities for comparing genomic
features. Bioinformatics (2010).
\begin{verbatim}
Version 2.14.1-3 (2-Nov-2011)
...
2.14.3-1 TESTED ubuntu
...
Version 2.17.0 (3-Nov-2012) TESTED guillimin
...
Version 2.22.1 (1 Jan 2015) TESTED github
\end{verbatim}
\includegraphics[height=2cm]{bedtools} = command line C++ program.
Source: bedtools \verb|RELEASE_HISTORY| file.
\end{frame}
\begin{frame}[fragile]
\frametitle{data.table is ``just like a data.frame''}
\includegraphics[height=2cm]{datatable}
data.table = an R package with compiled C code.
\begin{verbatim}
Version: 1.0, 2006-04-12
Author: Matt Dowle
Title: Just like a data.frame but without rownames,
up to 10 times faster, up to 10 times less memory
\end{verbatim}
% Description: This package does very little.
% The only reason for its existence is that the white
% book specifies that data.frame must have rownames.
% This package defines a new class data.table which
% operates just like a data.frame, but uses up to 10
% times less memory, and can be up to 10 times faster
% to create (and copy). It also takes the opportunity
% to allow subset() and with() like expressions inside
% the []. Most of the code is copied from base functions
% with the code manipulating row.names removed.
\end{frame}
\begin{frame}[fragile]
\frametitle{data.table is an ``extension of data.frame''}
\includegraphics[height=2cm]{datatable}
data.table = an R package with compiled C code.
\begin{verbatim}
Version: 1.9.4, 2014-10-02
Author: M Dowle, T Short, S Lianoglou, A Srinivasan
with contributions from R Saporta, E Antonyan
Title: Extension of data.frame
Description: Fast aggregation of large data
(e.g. 100GB in RAM), fast ordered joins,
fast add/modify/delete of columns
by group using no copies at all,
list columns and a fast file reader (fread)...
\end{verbatim}
% Offers a natural and flexible syntax,
% for faster development.
\end{frame}
% \begin{frame}
% \frametitle{Demo in R?}
% \begin{itemize}
% \item \text{data.table()}
% \end{itemize}
% \end{frame}
\begin{frame}[fragile]
\frametitle{data.table supports overlap joins}
data.table/README.md
\begin{verbatim}
Changes in v1.9.4 (on CRAN 2 Oct 2014)
NEW FEATURES
...
Overlap joins (#528) is now here, finally!!
Except for type="equal" and maxgap and minoverlap
arguments, everything else is implemented.
Check out ?foverlaps and the examples there on its usage.
This is a major feature addition to data.table.
\end{verbatim}
\alert{\url{https://github.com/Rdatatable/data.table/wiki/talks/EARL2014_OverlapRangeJoin_Arun.pdf}}
\end{frame}
\begin{frame}
\frametitle{Summary of contenders, demo}
\begin{tabular}{cccc}
&
\includegraphics[height=2.5cm]{bedtools} &
\includegraphics[height=2.5cm]{bioconductor} &
\includegraphics[height=2.5cm]{datatable} \\
& & Bioconductor & data.table \\
\hline
Since & 2009 & 2009 & 2014 \\
Language & C++ & R/C & R/C \\
Versions Tested & 2.14.3 & 3.0 & 1.9.4 \\
& 2.17.0 \\
& 2.22.1
\end{tabular}
\end{frame}
\section{Round 1: timings on genomic data}
\begin{frame}
\frametitle{data.table::fread faster than read.table for bedGraph
files}
\includegraphics[width=\textwidth]{figure-TF-benchmark}
\texttt{read.table(file, sep=, colClasses=, nrows=)} vs
\texttt{fread(file)}
\end{frame}
\begin{frame}[fragile]
\frametitle{Only compute overlapping indices}
\begin{verbatim}
GenomicRanges::findOverlaps(bedGraph.gr, windows.gr)
data.table::foverlaps(bedGraph.dt, windows.dt,
nomatch=0L, which=TRUE)
\end{verbatim}
\includegraphics[width=\textwidth]{figure-TF-benchmark-indices}
\vskip -0.5cm
$\approx 450$ genomic windows.
\end{frame}
\begin{frame}
\frametitle{Start from data.frame, convert to GRanges/data.table,
compute overlapping indices, select data}
\includegraphics[width=\textwidth]{figure-TF-benchmark-tables}
$\approx 450$ genomic windows.
\end{frame}
\begin{frame}
\frametitle{Read files, overlap, write file}
\includegraphics[width=\textwidth]{figure-TF-benchmark-IO}
$\approx 450$ genomic windows.
\end{frame}
\begin{frame}
\frametitle{Read files, overlap, write file}
\includegraphics[width=\textwidth]{figure-TF-benchmark-IO-some}
$\approx 450$ genomic windows.
\end{frame}
% \begin{frame}
% \frametitle{read, overlap, write}
% \includegraphics[width=\textwidth]{figure-TF-benchmark-IO-log}
% $\approx 450$ genomic windows.
% \end{frame}
% \begin{frame}
% \frametitle{data.table::foverlaps slower than
% GenomicRanges::findOverlaps????}
% \includegraphics[width=\textwidth]{figure-full-strand-times}
% 413 genomic windows, size of bedGraph file in million rows.
% \end{frame}
%%%%% 2.14.3
% thocking@silene:~/R/datatable-foverlaps(master*)$ rm overlap.bedGraph; time intersectBed -wa -wb -a windows.bed -b chipseq.bedGraph > overlap.bedGraph && wc -l windows.bed chipseq.bedGraph overlap.bedGraph
% real 0m24.331s
% user 0m22.712s
% sys 0m1.496s
% 479 windows.bed
% 21113886 chipseq.bedGraph
% 110781 overlap.bedGraph
% 21225146 total
%%%%%%%% 2.17.0
% thocking@silene:~/R/datatable-foverlaps(master*)$ rm overlap.bedGraph; time bedtools-2.17.0/bin/intersectBed -wa -wb -a windows.bed -b chipseq.bedGraph > overlap.bedGraph && wc -l windows.bed chipseq.bedGraph overlap.bedGraph
% real 0m45.178s
% user 0m39.152s
% sys 0m3.312s
% 479 windows.bed
% 21113886 chipseq.bedGraph
% 110781 overlap.bedGraph
% 21225146 total
% thocking@silene:~/R/datatable-foverlaps(master*)$ rm overlap.bedGraph; time bedtools-2.17.0/bin/intersectBed -wa -wb -a windows.bed -b chipseq.bedGraph -sorted > overlap.bedGraph && wc -l windows.bed chipseq.bedGraph overlap.bedGraph
% real 0m36.502s
% user 0m35.752s
% sys 0m0.464s
% 479 windows.bed
% 21113886 chipseq.bedGraph
% 110781 overlap.bedGraph
% 21225146 total
%%%%% 2.22.1
% thocking@silene:~/R/datatable-foverlaps(master*)$ rm overlap.bedGraph; time bedtools2/bin/intersectBed -sorted -wa -wb -a windows.bed -b chipseq.bedGraph > overlap.bedGraph && wc -l windows.bed chipseq.bedGraph overlap.bedGraph
% real 0m11.064s
% user 0m10.776s
% sys 0m0.120s
% 479 windows.bed
% 21113886 chipseq.bedGraph
% 110781 overlap.bedGraph
% 21225146 total
% thocking@silene:~/R/datatable-foverlaps(master*)$ rm overlap.bedGraph; time bedtools2/bin/intersectBed -wa -wb -a windows.bed -b chipseq.bedGraph > overlap.bedGraph && wc -l windows.bed chipseq.bedGraph overlap.bedGraph
% real 0m27.094s
% user 0m23.836s
% sys 0m3.148s
% 479 windows.bed
% 21113886 chipseq.bedGraph
% 110781 overlap.bedGraph
% 21225146 total
\section{Round 2: accuracy on genomic data}
\begin{frame}
\frametitle{data.table::foverlaps always gives the same result as
GenomicRanges::findOverlaps}
\includegraphics[height=2.5cm]{datatable} =
\includegraphics[height=2.5cm]{bioconductor}
\dots\ but are they correct?
\end{frame}
\begin{frame}[fragile]
\frametitle{Need chromStart+1 in R!}
chipseq.bedGraph
\begin{verbatim}
chrom chromStart chromEnd coverage
chr1 0 200 1
chr1 200 300 2
\end{verbatim}
Expected versus computed coverage for
\begin{itemize}
\item 4 test windows (columns) and
\item 5 overlap methods (rows).
\end{itemize}
\input{table-simple}
\end{frame}
\section{And the champion is...}
\begin{frame}
\frametitle{All packages have similar speed, accuracy}
Caveats:
\begin{itemize}
\item need most recent versions!
\item need chromStart+1 in R!
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Bonus points}
+1 to
\begin{itemize}
\item \textbf{bedtools} for binary files (BAM).
\item \textbf{data.table} for \texttt{fread}\\
(read text files faster than \texttt{read.table}).
\item \textbf{bedtools} and \textbf{GenomicRanges::findOverlaps}
for -f/minoverlap\\
(not yet implemented in \texttt{data.table::foverlaps}).
\item \textbf{GenomicRanges::findOverlaps} for maxgap \\
(not yet implemented in \texttt{data.table::foverlaps}).
\item \textbf{bedtools} for native support for 0-based chromStart of
bed/bedGraph files (need to use chromStart+1 in R).
\end{itemize}
\end{frame}
\section{Google Summer of Code}
\begin{frame}
\frametitle{Google Summer of Code (GSOC)}
Student gets \$5000 for writing open-source code for
3 months.
\begin{description}
\item[Feb] \textbf{Admins} for open source organizations
e.g. R, Bioconductor apply to Google.
\item[Mar] \textbf{Mentors} suggest projects for each org.\\
\textbf{Students} submit project proposals to Google.\\
Google gives funding for $n$ students to an org.
\item[Apr] The top $n$ students get \$500 and begin coding.
\item[Jul] Midterm evaluation, pass = \$2250.
\item[Aug] Final evaluation, pass = \$2250.
\item[Nov] Orgs get \$500/student mentored.
\end{description}
I have participated as an \textbf{admin} and \textbf{mentor} for the
R project.
\end{frame}
\begin{frame}
\frametitle{What makes a good GSOC project?}
Coding projects should:
\begin{itemize}
\item Result in free/open-source software.
\item Be 3 months of full-time work for a student.
\item Include writing documentation and tests.
\item Not include original research.
\end{itemize}
Examples:
\begin{itemize}
\item Louis/Mathieu can be \textbf{admins} for MUGQIC org.
\item Warren/Stephan can be \textbf{mentors} for a project to write
a new R package for methylation analysis.
\item Robert/Dan can be \textbf{mentors} for a project to implement
new features in Gemini.
\item Any undergrad/master/PhD candidates (at McGill or not)
can be \textbf{students}.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Thanks for your attention!}
Any questions? \texttt{toby.hocking@mail.mcgill.ca}
\vskip 1cm
Source code for benchmarks is at
\url{https://github.com/tdhock/datatable-foverlaps}
\end{frame}
\begin{frame}[fragile]
\frametitle{old intersectBed -sorted is much slower}
``-sorted Invoke a memory-efficient algorithm for very large files.''
Version: 2.14.3-1
% thocking@silene:~/R/datatable-foverlaps(master*)$ time intersectBed -a max-windows.bed -b /home/thocking/genomecov/max/yale_k562_rep1.bedGraph-strand -sorted > overlap.bedGraph
% \begin{verbatim}
% thocking@silene:~/R/datatable-foverlaps(master*)$ time intersectBed -a max-windows.bed -b /home/thocking/genomecov/max/yale_k562_rep1.bedGraph-strand -sorted > overlap.bedGraph
% real 154m10.653s
% user 153m2.188s
% sys 0m20.204s
% thocking@silene:~/R/datatable-foverlaps(master)$ wc -l max-windows.bed /home/thocking/genomecov/max/yale_k562_rep1.bedGraph-strand
% 413 max-windows.bed
% 7617673 /home/thocking/genomecov/max/yale_k562_rep1.bedGraph-strand
% \end{verbatim}
\begin{verbatim}
$ time intersectBed -a windows.bed \
-b chip-seq.bedGraph -sorted > overlap.bedGraph
real 154m10.653s
user 153m2.188s
sys 0m20.204s
$ time intersectBed -a windows.bed \
-b chip-seq.bedGraph > overlap.bedGraph
real 0m8.733s
user 0m7.932s
sys 0m0.728s
\end{verbatim}
% $ wc -l windows.bed chip-seq.bedGraph overlap.bedGraph
% 413 windows.bed
% 7617673 chip-seq.bedGraph
% 47840 overlap.bedGraph
% thocking@silene:~/R/datatable-foverlaps(master*)$ du -ms max-windows.bed /home/thocking/genomecov/max/yale_k562_rep1.bedGraph-strand overlap.bedGraph
% 1 max-windows.bed
% 189 /home/thocking/genomecov/max/yale_k562_rep1.bedGraph-strand
% 2 overlap.bedGraph
\end{frame}
\end{document}