Skip to content

Commit 8292cc8

Browse files
author
Florian Wagner
committed
First commit
1 parent d6ffa0e commit 8292cc8

File tree

5 files changed

+269
-0
lines changed

5 files changed

+269
-0
lines changed

README.md

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# XL-mHG
2+
3+
The XL-mHG is a nonparametric enrichment test for ranked binary data, and an extension of the mHG test. The mHG test was developed by [Dr. Zohar Yakhini](http://bioinfo.cs.technion.ac.il/people/zohar) and colleagues.
4+
5+
If you use the XL-mHG in your research, please cite [Eden et al. (2007)](http://dx.doi.org/10.1371/journal.pcbi.0030039) and [Wagner (2015)](http://dx.doi.org/10.1101/018705).
6+
7+
# Requirements and Installation
8+
9+
This algorithm requires the Python packages NumPy and Cython, and was developed under Linux using Python 2.7. Running `make` should generate `xlmHG_cython.so`, which can then be imported from any Python script using `import xlmHG_cython`.

__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
import sys
2+
3+
# Names exported by `from <package> import *`: only the compiled extension module.
__all__ = ['xlmHG_cython']

makefile

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# `all` and `cython` name actions rather than files; declaring them phony
# keeps a stray file or directory with one of these names from masking the build.
.PHONY: all cython

# Default target: build the extension module.
all: cython

# Compile xlmHG_cython.pyx into an importable extension module in this directory.
cython: xlmHG_cython.pyx
	python2 setup.py build_ext --inplace

setup.py

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
from distutils.core import setup
2+
from distutils.extension import Extension
3+
from Cython.Distutils import build_ext
4+
#from Cython.Distutils import Extension
5+
6+
import numpy as np
7+
8+
# Additional header / library search paths (none are needed by default).
include_dirs = []
library_dirs = []

# The single extension module, built from the Cython source file.
# NumPy's header directory is always placed first on the include path.
ext_modules = [
    Extension(
        "xlmHG_cython",
        ["xlmHG_cython.pyx"],
        library_dirs=library_dirs,
        extra_link_args=['-fPIC'],
        include_dirs=[np.get_include()] + include_dirs,
    ),
]

# Use Cython's build_ext command so that the .pyx source is compiled.
setup(
    name='cython tools',
    cmdclass={'build_ext': build_ext},
    ext_modules=ext_modules,
)

xlmHG_cython.pyx

+232
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
# Cython implementation of the XL-mHG test
2+
# Copyright (c) 2015 Florian Wagner
3+
#
4+
# This program is free software: you can redistribute it and/or modify
5+
# it under the terms of the GNU General Public License, Version 3,
6+
# as published by the Free Software Foundation.
7+
#
8+
# This program is distributed in the hope that it will be useful,
9+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11+
# GNU General Public License for more details.
12+
#
13+
# You should have received a copy of the GNU General Public License
14+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
15+
16+
#cython: profile=False
17+
#cython: wraparound=False
18+
#cython: boundscheck=False
19+
#cython: cdivision=True
20+
21+
cimport cython
22+
23+
import numpy as np
24+
cimport numpy as np
25+
26+
np.import_array()
27+
28+
cdef extern from "math.h":
29+
long double fabsl(long double x)
30+
double NAN
31+
32+
cdef inline int is_equal(long double a, long double b, long double tol):
    # Tests two long double values for equality up to a relative tolerance.
    #
    # Returns 1 if a and b are exactly equal, or if their absolute difference
    # divided by the larger of their magnitudes is below `tol`; returns 0
    # otherwise.
    cdef long double rel_diff

    # Exact equality is settled first; this also covers a == b == 0, where
    # the relative difference below would be 0/0.
    if a == b:
        return 1

    rel_diff = fabsl(a - b) / max(fabsl(a), fabsl(b))
    if rel_diff < tol:
        return 1
    return 0
40+
41+
42+
cdef long double get_hypergeometric_pvalue(
        long double p, int k, int N, int K, int n):
    # Calculates the hypergeometric p-value when P(k | N,K,n) is already known.
    #
    # p: the probability P(k | N,K,n) of the observed configuration
    # k, N, K, n: the hypergeometric parameters that p was computed for
    #
    # Starting from p, each subsequent term P(i+1 | N,K,n) is obtained from
    # P(i | N,K,n) by multiplying with
    #     (n-i)*(K-i) / ((i+1)*(N-K-n+i+1))
    # for i = k, ..., min(K,n)-1, and all terms are summed up.
    #
    # Returns the sum of P(i | N,K,n) over i = k, ..., min(K,n).
    cdef long double pval = p
    cdef int i
    for i in range(k, min(K, n)):
        # Cast every factor to long double *before* multiplying: the products
        # of two C ints can exceed the range of int for large N, which would
        # silently wrap around and corrupt the p-value.
        p *= (((<long double>(n-i)) * (<long double>(K-i))) /
              ((<long double>(i+1)) * (<long double>(N-K-n+i+1))))
        pval += p
    return pval
52+
53+
54+
cdef int get_mHG(unsigned char[::1] v, int N, int K, int L, int X,
        long double[::1] mHG_array,
        long double tol):
    # Calculates the XL-mHG test statistic.
    #
    # v: the ranked list; a zero entry is a miss, any non-zero entry is a hit
    #    (only v[0], ..., v[L-1] are read)
    # N, K: hypergeometric parameters used for the probabilities
    #    (the front-end describes K as the number of positives)
    # L: number of leading entries of v that are examined
    # X: minimum number of hits that must have been seen before a cutoff
    #    is considered
    # mHG_array: output array; the statistic is stored in mHG_array[0]
    # tol: relative tolerance for floating point comparisons
    #
    # Returns the cutoff (1-based) at which the minimum is achieved, or 0 if
    # no cutoff qualified (in which case mHG_array[0] is set to 1.0).
    cdef int k = 0
    cdef long double p = 1.0
    cdef long double pval
    cdef long double mHG = 1.1  # larger than any valid p-value
    cdef int threshold = 0
    cdef int n

    # degenerate cases: nothing can be enriched
    if K == 0 or K == N or K < X:
        mHG_array[0] = 1.0
        return 0

    for n in range(L):
        # NOTE: every factor is cast to long double *before* multiplying,
        # because the products of two C ints can overflow for large N.
        if v[n] == 0:
            # calculate P(k | N,K,n+1) from P(k | N,K,n)
            p *= (((<long double>(n+1)) * (<long double>(N-K-n+k))) /
                  ((<long double>(N-n)) * (<long double>(n-k+1))))
        else:
            # hit one => calculate hypergeometric p-value
            # calculate P(k+1 | N,K,n+1) from P(k | N,K,n)
            p *= (((<long double>(n+1)) * (<long double>(K-k))) /
                  ((<long double>(N-n)) * (<long double>(k+1))))
            k += 1
            if k >= X:  # calculate p-value only if enough elements have been seen
                pval = get_hypergeometric_pvalue(p, k, N, K, n+1)

                # only accept a strictly smaller p-value (beyond tolerance),
                # so that among ties the earliest cutoff is kept
                if pval < mHG and (not is_equal(pval, mHG, tol)):
                    # make sure we don't set mHG to something negative
                    if pval < 0:
                        mHG = 0
                    else:
                        mHG = pval
                    threshold = n+1

    if threshold == 0:  # there were not enough positives in v[:L]
        mHG_array[0] = 1.0
    else:
        mHG_array[0] = mHG
    return threshold
97+
98+
99+
cdef long double get_mHG_pvalue(int N, int K, int L, int X,
        long double mHG,
        long double[:,::1] matrix,
        long double tol):
    # Calculates the XL-mHG p-value.
    #
    # N, K, L, X: parameters of the test (same meaning as in get_mHG)
    # mHG: the observed XL-mHG test statistic
    # matrix: work space, indexed as matrix[k,w] with 0 <= k <= K and
    #    0 <= w <= N-K; its previous contents are overwritten
    # tol: relative tolerance for floating point comparisons
    #
    # Returns 1.0 if mHG is (approximately) 1 or larger, 0 for the other
    # cheap-check cases below, NAN if floating point precision was
    # insufficient, and otherwise 1 minus the sum of the two matrix entries
    # adjacent to the final corner matrix[K,N-K].
    cdef int W = N-K
    cdef int n, k, w

    cdef long double p_start = 1.0
    cdef long double p
    cdef long double pval

    # cheap checks
    if mHG > 1.0 or is_equal(mHG, 1.0, tol):
        return 1.0
    elif mHG == 0:
        return 0
    elif K == 0 or K >= N or K < X:
        return 0
    elif L > N:
        return 0

    # initialization
    matrix[0,0] = 1.0

    # go over all thresholds, except last
    for n in range(1, N):

        # p_start tracks the probability of the configuration with the
        # highest possible number of hits (k = min(n,K)) at threshold n
        if K >= n:
            k = n
            p_start *= ((<long double>(K - n + 1)) /
                        (<long double>(N - n + 1)))
        else:
            k = K
            p_start *= ((<long double>n) /
                        (<long double>(n - K)))

        if p_start <= 0:
            # not enough floating point precision to calculate p-value
            return <long double>NAN

        p = p_start
        pval = p_start
        w = n - k

        # R is the space of configurations with mHG better than or equal to the one observed
        # - go over all configurations for threshold n
        # - start with highest possible enrichment and then go down
        # - as long as we're in R, all paths going through this configuration are "doomed"
        # - because we're using (K x W) grid instead of parallelogram, "going down" becomes going down and right...

        # no configuration with threshold > L or threshold < X can be in R
        if n <= L and n >= X:
            # find the first configuration that's not in R
            # this happens when either k < X, or hypergeometric p-value > mHG
            # if k == 0 or w == W, we have hypergeometric p-value = 1
            # since mHG < 1, as soon as k == 0 or w == W, we have left R
            while k >= X and w < W and (is_equal(pval, mHG, tol) or pval < mHG):
                # k > 0 is implied
                matrix[k,w] = 0  # we're still in R
                # cast every factor to long double *before* multiplying:
                # the products of two C ints can overflow for large N
                p *= (((<long double>k) * (<long double>(N-K-n+k))) /
                      ((<long double>(n-k+1)) * (<long double>(K-k+1))))
                pval += p
                w += 1
                k -= 1

        # fill in rest of the matrix based on entries for threshold n-1
        while k >= 0 and w <= W:
            if w > 0 and k > 0:
                matrix[k,w] = matrix[k,w-1]*(<long double>(W-w+1))/(<long double>(N-n+1)) + \
                        matrix[k-1,w]*(<long double>(K-k+1))/(<long double>(N-n+1))
            elif w > 0:
                matrix[k,w] = matrix[k,w-1]*(<long double>(W-w+1))/(<long double>(N-n+1))
            elif k > 0:
                matrix[k,w] = matrix[k-1,w]*(<long double>(K-k+1))/(<long double>(N-n+1))

            w += 1
            k -= 1

    return 1.0 - (matrix[K,W-1] + matrix[K-1,W])
179+
180+
181+
def mHG_test(unsigned char[::1] v, int N, int K, int L, int X, mat=None, use_upper_bound=False, verbose=False, tolerance=1e-16):
    """Front-end for the XL-mHG test.

    Parameters
    ----------
    v : contiguous 1-dimensional array of unsigned char
        The ranked list; a zero entry is a miss, any non-zero entry is a hit.
        Must contain at least `L` elements.
    N : int
        Total number of elements (``N >= 0``).
    K : int
        Number of positives (``0 <= K <= N``).
    L : int
        Number of leading elements of `v` that are examined (``0 <= L <= N``).
    X : int
        Minimum number of hits required before a cutoff is considered
        (``0 <= X <= K``).
    mat : numpy.ndarray, optional
        Pre-allocated, C-contiguous work space of dtype ``numpy.longdouble``
        with shape at least ``(K+1, N-K+1)``. If None, a work space is
        allocated when the exact p-value is calculated.
    use_upper_bound : bool, optional
        If True, report ``min(1, K * mHG)`` instead of calculating the
        p-value.
    verbose : bool, optional
        Accepted, but not used by this function.
    tolerance : float, optional
        Relative tolerance used for floating point equality comparisons.

    Returns
    -------
    tuple
        ``(threshold, mHG, pvalue)``: the cutoff at which the statistic is
        attained (0 if there is none), the XL-mHG test statistic, and its
        p-value (or the upper bound). The latter two are double-precision
        values.

    Raises
    ------
    AssertionError
        If the parameters or the supplied `mat` are invalid.
    ValueError
        If `v` contains fewer than `L` elements.
    """
    cdef long double[:,::1] matrix
    cdef long double[::1] mHG_array
    cdef long double tol
    cdef int threshold
    cdef long double mHG, mHG_pvalue
    cdef double mHG_double, mHG_pvalue_double

    # sanity checks
    assert N >= 0
    assert 0 <= K <= N
    assert 0 <= L <= N
    assert 0 <= X <= K

    if K == 0 or K == N:  # check if we have any positives at all, or if all entries are positives
        return 0,1.0,1.0

    if mat is not None:
        # check whether the supplied matrix is valid
        assert mat.dtype == np.longdouble
        assert mat.flags['C_CONTIGUOUS']
        assert mat.shape[0] >= K+1 and mat.shape[1] >= N-K+1
        matrix = mat

    tol = <long double>tolerance

    # This module is compiled with bounds checking disabled, so reading
    # v[0..L-1] from a shorter array would access invalid memory.
    # A real exception is raised here (instead of an assertion), because
    # assertions can be disabled.
    if v.shape[0] < L:
        raise ValueError('v must contain at least L = %d elements (found %d).'
                         % (L, v.shape[0]))

    # get XL-mHG and corresponding threshold
    mHG_array = np.zeros(1, dtype=np.longdouble)
    threshold = get_mHG(v, N, K, L, X, mHG_array, tol)
    mHG = mHG_array[0]
    if is_equal(mHG, 1.0, tol):  # check if there is anything going on at all
        return threshold,1.0,1.0

    if use_upper_bound:
        # don't calculate XL-mHG p-value, use upper bound instead
        mHG_pvalue_double = <double>min(1.0, mHG*(<long double>K))

    else:
        if mat is None:
            # initialize matrix array (only now that it is actually needed)
            matrix = np.empty((K+1, N-K+1), dtype=np.longdouble)

        # calculate XL-mHG p-value
        mHG_pvalue = get_mHG_pvalue(N, K, L, X, mHG, matrix, tol)
        # convert to double precision
        mHG_pvalue_double = <double>mHG_pvalue

        # check whether floating point accuracy was insufficient for calculation of the p-value
        if mHG_pvalue_double <= 0 or np.isnan(mHG_pvalue_double):
            # if so, use upper bound instead
            # (capped at 1.0, exactly like the use_upper_bound branch)
            mHG_pvalue_double = <double>min(1.0, mHG*(<long double>K))

    mHG_double = <double>mHG
    return threshold,mHG_double,mHG_pvalue_double

0 commit comments

Comments
 (0)