|
| 1 | +# Cython implementation of the XL-mHG test |
| 2 | +# Copyright (c) 2015 Florian Wagner |
| 3 | +# |
| 4 | +# This program is free software: you can redistribute it and/or modify |
| 5 | +# it under the terms of the GNU General Public License, Version 3, |
| 6 | +# as published by the Free Software Foundation. |
| 7 | +# |
| 8 | +# This program is distributed in the hope that it will be useful, |
| 9 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 11 | +# GNU General Public License for more details. |
| 12 | +# |
| 13 | +# You should have received a copy of the GNU General Public License |
| 14 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. |
| 15 | + |
| 16 | +#cython: profile=False |
| 17 | +#cython: wraparound=False |
| 18 | +#cython: boundscheck=False |
| 19 | +#cython: cdivision=True |
| 20 | + |
| 21 | +cimport cython |
| 22 | + |
| 23 | +import numpy as np |
| 24 | +cimport numpy as np |
| 25 | + |
| 26 | +np.import_array() |
| 27 | + |
| 28 | +cdef extern from "math.h": |
| 29 | + long double fabsl(long double x) |
| 30 | + double NAN |
| 31 | + |
cdef inline int is_equal(long double a, long double b, long double tol):
    # Approximate equality test for two long double values.
    #
    # Returns 1 if a and b are identical, or if their absolute difference,
    # divided by the larger of their two magnitudes, is smaller than tol;
    # returns 0 otherwise.
    cdef long double scale
    if a != b:
        # a != b guarantees that at least one magnitude is non-zero,
        # so the relative difference is well-defined
        scale = max(fabsl(a), fabsl(b))
        return 1 if fabsl(a - b) / scale < tol else 0
    return 1
| 40 | + |
| 41 | + |
cdef long double get_hypergeometric_pvalue(
        long double p, int k, int N, int K, int n):
    # Calculates the hypergeometric p-value when p = P(k | N,K,n) is
    # already known.
    #
    # Parameters:
    #   p - the probability P(k | N,K,n) of the starting configuration
    #   k - starting value of the summation index
    #   N, K, n - parameters of the hypergeometric distribution, as used
    #             in P(k | N,K,n)
    #
    # Returns p plus all terms P(i | N,K,n) for i = k+1, ..., min(K,n),
    # each obtained from its predecessor by a multiplicative update.
    cdef long double pval = p
    cdef int i
    for i in range(k, min(K, n)):
        # P(i+1 | N,K,n) = P(i | N,K,n) * (n-i)*(K-i) / ((i+1)*(N-K-n+i+1)).
        # Every factor is converted to long double *before* multiplying:
        # a product of two C ints overflows silently once it exceeds the
        # int range, which happens for large N.
        p *= (((<long double>(n-i)) * (<long double>(K-i))) /
              ((<long double>(i+1)) * (<long double>(N-K-n+i+1))))
        pval += p
    return pval
| 52 | + |
| 53 | + |
cdef int get_mHG(unsigned char[::1] v, int N, int K, int L, int X,
        long double[::1] mHG_array,
        long double tol):
    # Calculates the XL-mHG test statistic.
    #
    # Parameters:
    #   v         - array that is scanned from index 0 up to (excluding) L;
    #               an entry equal to 0 is treated as a "miss", any other
    #               value as a "hit"
    #   N, K      - parameters of the hypergeometric distribution P(k | N,K,n)
    #   L         - number of leading entries of v (thresholds) to examine
    #   X         - minimum number of hits that must have been seen before a
    #               threshold is considered
    #   mHG_array - output array; the statistic is written to mHG_array[0]
    #   tol       - relative tolerance passed to is_equal()
    #
    # Returns the threshold (1-based cutoff n) at which the minimum
    # hypergeometric p-value is attained, or 0 if no threshold qualified
    # (in which case mHG_array[0] is set to 1.0).
    #
    # NOTE: v is indexed without bounds checking (boundscheck=False), so the
    # caller must make sure that it holds at least L elements.

    if K == 0 or K == N or K < X:
        mHG_array[0] = 1.0
        return 0

    cdef int k = 0
    cdef long double p = 1.0
    cdef long double pval
    cdef long double mHG = 1.1  # larger than any valid p-value
    cdef int threshold = 0
    cdef int n
    for n in range(L):
        # In both updates below, every factor is converted to long double
        # *before* multiplying: a product of two C ints overflows silently
        # for large N.
        if v[n] == 0:
            # calculate P(k | N,K,n+1) from P(k | N,K,n)
            p *= (((<long double>(n+1)) * (<long double>(N-K-n+k))) /
                  ((<long double>(N-n)) * (<long double>(n-k+1))))
        else:
            # hit one => calculate hypergeometric p-value
            # calculate P(k+1 | N,K,n+1) from P(k | N,K,n)
            p *= (((<long double>(n+1)) * (<long double>(K-k))) /
                  ((<long double>(N-n)) * (<long double>(k+1))))
            k += 1
            if k >= X:  # calculate p-value only if enough elements have been seen
                pval = get_hypergeometric_pvalue(p, k, N, K, n+1)

                # only accept a new minimum if it is smaller beyond tolerance
                if pval < mHG and (not is_equal(pval, mHG, tol)):
                    # make sure we don't set mHG to something negative
                    if pval < 0:
                        mHG = 0
                    else:
                        mHG = pval
                    threshold = n+1

    if threshold == 0:  # there were not enough positives in v[:L]
        mHG_array[0] = 1.0
    else:
        mHG_array[0] = mHG
    return threshold
| 97 | + |
| 98 | + |
cdef long double get_mHG_pvalue(int N, int K, int L, int X,
        long double mHG,
        long double[:,::1] matrix,
        long double tol):
    # Calculates the XL-mHG p-value for an observed test statistic.
    #
    # Parameters:
    #   N, K   - parameters of the hypergeometric distribution P(k | N,K,n)
    #   L      - largest threshold that can contribute to the statistic
    #   X      - minimum number of positives required at a threshold
    #   mHG    - observed XL-mHG test statistic
    #   matrix - scratch array, indexed as matrix[k,w] with 0 <= k <= K and
    #            0 <= w <= N-K; its previous contents are overwritten
    #   tol    - relative tolerance passed to is_equal()
    #
    # Returns the p-value, or NaN if p_start drops to <= 0 because of
    # insufficient floating point precision.

    # cheap checks
    if mHG > 1.0 or is_equal(mHG, 1.0, tol):
        return 1.0
    elif mHG == 0:
        return 0
    elif K == 0 or K >= N or K < X:
        return 0
    elif L > N:
        return 0

    # initialization
    cdef int W = N-K
    cdef int n, k, w

    cdef long double p_start = 1.0
    cdef long double p
    cdef long double pval
    matrix[0,0] = 1.0

    # go over all thresholds, except last
    for n in range(1, N):

        # p_start is updated to the probability of the configuration with
        # the highest possible k for this threshold (k = min(n,K))
        if K >= n:
            k = n
            p_start *= ((<long double>(K - n + 1)) /
                        (<long double>(N - n + 1)))
        else:
            k = K
            p_start *= ((<long double>n) /
                        (<long double>(n - K)))

        if p_start <= 0:
            # not enough floating point precision to calculate p-value
            return <long double>NAN

        p = p_start
        pval = p_start
        w = n - k

        # R is the space of configurations with mHG better than or equal to the one observed
        # - go over all configurations for threshold n
        # - start with highest possible enrichment and then go down
        # - as long as we're in R, all paths going through this configuration are "doomed"
        # - because we're using (K x W) grid instead of parallelogram, "going down" becomes going down and right...

        # no configuration with threshold > L or threshold < X can be in R
        if n <= L and n >= X:
            # find the first configuration that's not in R
            # this happens when either k < X, or hypergeometric p-value > mHG
            # if k == 0 or w == W, we have hypergeometric p-value = 1
            # since mHG < 1, as soon as k == 0 or w == W, we have left R
            while k >= X and w < W and (is_equal(pval, mHG, tol) or pval < mHG):
                # k > 0 is implied
                matrix[k,w] = 0  # we're still in R
                # factors are converted to long double before multiplying,
                # so that the products cannot overflow the C int range
                p *= (((<long double>k) * (<long double>(N-K-n+k))) /
                      ((<long double>(n-k+1)) * (<long double>(K-k+1))))
                pval += p
                w += 1
                k -= 1

        # fill in rest of the matrix based on entries for threshold n-1
        while k >= 0 and w <= W:
            if w > 0 and k > 0:
                matrix[k,w] = matrix[k,w-1]*(<long double>(W-w+1))/(<long double>(N-n+1)) + \
                              matrix[k-1,w]*(<long double>(K-k+1))/(<long double>(N-n+1))
            elif w > 0:
                matrix[k,w] = matrix[k,w-1]*(<long double>(W-w+1))/(<long double>(N-n+1))

            elif k > 0:
                matrix[k,w] = matrix[k-1,w]*(<long double>(K-k+1))/(<long double>(N-n+1))

            w += 1
            k -= 1

    # the two entries for threshold N-1 are summed and subtracted from 1;
    # presumably this sum is the total probability of all paths that never
    # entered R
    return 1.0 - (matrix[K,W-1] + matrix[K-1,W])
| 179 | + |
| 180 | + |
def mHG_test(unsigned char[::1] v, int N, int K, int L, int X, mat=None, use_upper_bound=False, verbose=False, tolerance=1e-16):
    # Front-end for the XL-mHG test.
    #
    # Parameters:
    #   v               - contiguous array of unsigned chars; entries equal
    #                     to 0 are misses, all other values are hits. Only
    #                     the first L entries are read.
    #   N               - total number of elements
    #   K               - total number of positives (0 <= K <= N)
    #   L               - largest threshold to consider (0 <= L <= N)
    #   X               - minimum number of positives required at a
    #                     threshold (0 <= X <= K)
    #   mat             - optional pre-allocated, C-contiguous array of dtype
    #                     np.longdouble with shape at least (K+1, N-K+1),
    #                     used as scratch space for the p-value calculation
    #   use_upper_bound - if true, skip the exact p-value calculation and
    #                     report min(1, mHG*K) instead
    #   verbose         - currently unused
    #   tolerance       - relative tolerance for floating point comparisons
    #
    # Returns a tuple (threshold, mHG, p-value).
    #
    # Raises AssertionError if the parameters or the supplied matrix are
    # invalid, and ValueError if v holds fewer than L elements.

    # sanity checks
    assert N >= 0
    assert 0 <= K <= N
    assert 0 <= L <= N
    assert 0 <= X <= K

    if K == 0 or K == N:  # check if we have any positives at all, or if all entries are positives
        return 0,1.0,1.0

    cdef long double [:,::1] matrix
    if mat is None:
        # initialize matrix array
        matrix = np.empty((K+1,N-K+1),dtype=np.longdouble)
    else:
        # check whether the supplied matrix is valid
        assert mat.dtype == np.longdouble
        assert mat.flags['C_CONTIGUOUS']
        assert mat.shape[0] >= K+1 and mat.shape[1] >= N-K+1
        matrix = mat

    cdef long double tol = <long double>tolerance
    cdef int threshold
    cdef long double mHG,mHG_pvalue
    cdef double mHG_double,mHG_pvalue_double

    # get_mHG() reads v[0], ..., v[L-1] without bounds checking
    # (boundscheck=False), so a too-short array must be rejected here
    if v.shape[0] < L:
        raise ValueError('v must contain at least L = %d elements, but has only %d.'
                         % (L, v.shape[0]))

    # get XL-mHG and corresponding threshold
    cdef long double[::1] mHG_array = np.zeros(1,dtype=np.longdouble)
    threshold = get_mHG(v, N, K, L, X, mHG_array, tol)
    mHG = mHG_array[0]
    if is_equal(mHG,1.0,tol):  # check if there is anything going on at all
        return threshold,1.0,1.0

    if use_upper_bound:
        # don't calculate XL-mHG p-value, use upper bound instead
        mHG_pvalue_double = <double>min(1.0,mHG*(<long double>K))

    else:
        # calculate XL-mHG p-value
        mHG_pvalue = get_mHG_pvalue(N, K, L, X, mHG, matrix, tol)
        # convert to double precision
        mHG_pvalue_double = <double>mHG_pvalue

        # check whether floating point accuracy was insufficient for calculation of the p-value
        if mHG_pvalue_double <= 0 or np.isnan(mHG_pvalue_double):
            # if so, use upper bound instead (clamped to 1.0, exactly as in
            # the use_upper_bound branch, since a p-value cannot exceed 1)
            mHG_pvalue_double = <double>min(1.0,mHG*(<long double>K))

    mHG_double = <double>mHG
    return threshold,mHG_double,mHG_pvalue_double
0 commit comments