Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Add Dask Implementation of PCA Functions #259

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 16 additions & 9 deletions allel/stats/decomposition.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division


import numpy as np

import dask.array as da

from allel.stats.preprocessing import get_scaler

Expand Down Expand Up @@ -80,8 +79,6 @@ def fit_transform(self, gn):
return u

def _fit(self, gn):
import scipy.linalg

# apply scaling
gn = self.scaler_.fit(gn).transform(gn)

Expand All @@ -91,7 +88,12 @@ def _fit(self, gn):
n_samples, n_features = x.shape

# singular value decomposition
u, s, v = scipy.linalg.svd(x, full_matrices=False)
if type(x) is da.Array:
from dask.array.linalg import svd as dask_svd
u, s, v = dask_svd(x)
else:
import scipy.linalg
u, s, v = scipy.linalg.svd(x, full_matrices=False)

# calculate explained variance
explained_variance_ = (s ** 2) / n_samples
Expand Down Expand Up @@ -209,7 +211,6 @@ def fit_transform(self, gn):

def _fit(self, gn):
from sklearn.utils.validation import check_random_state
from sklearn.utils.extmath import randomized_svd

# apply scaling
gn = self.scaler_.fit(gn).transform(gn)
Expand All @@ -224,9 +225,15 @@ def _fit(self, gn):
n_samples, n_features = x.shape

# singular value decomposition
u, s, v = randomized_svd(x, n_components,
n_iter=self.iterated_power,
random_state=random_state)
if type(x) is da.Array:
from dask.array.linalg import svd_compressed
u, s, v = svd_compressed(x, n_components,
n_power_iter=self.iterated_power)
else:
from sklearn.utils.extmath import randomized_svd
u, s, v = randomized_svd(x, n_components,
n_iter=self.iterated_power,
random_state=random_state)

# calculate explained variance
self.explained_variance_ = exp_var = (s ** 2) / n_samples
Expand Down
30 changes: 21 additions & 9 deletions allel/stats/preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division


import numpy as np
import dask.array as da



from allel.compat import text_type
Expand Down Expand Up @@ -37,11 +38,13 @@ def fit(self, gn):
# check input
gn = asarray_ndim(gn, 2)

# find mean
self.mean_ = np.mean(gn, axis=1, keepdims=True)

# find scaling factor
self.std_ = np.std(gn, axis=1, keepdims=True)
# find mean and scaling factor
if type(gn) is da.Array:
self.mean_ = da.mean(gn, axis=1, keepdims=True)
self.std_ = da.std(gn, axis=1, keepdims=True)
else:
self.mean_ = np.mean(gn, axis=1, keepdims=True)
self.std_ = np.std(gn, axis=1, keepdims=True)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would just use gn.mean(axis=1, keepdims=True) (and gn.std(...)) — the method form dispatches correctly on both NumPy and Dask arrays, so the type(gn) is da.Array branch becomes unnecessary.


return self

Expand Down Expand Up @@ -79,7 +82,10 @@ def fit(self, gn):
gn = asarray_ndim(gn, 2)

# find mean
self.mean_ = np.mean(gn, axis=1, keepdims=True)
if type(gn) is da.Array:
self.mean_ = da.mean(gn, axis=1, keepdims=True)
else:
self.mean_ = np.mean(gn, axis=1, keepdims=True)

return self

Expand Down Expand Up @@ -115,11 +121,17 @@ def fit(self, gn):
gn = asarray_ndim(gn, 2)

# find mean
self.mean_ = np.mean(gn, axis=1, keepdims=True)
if type(gn) is da.Array:
self.mean_ = da.mean(gn, axis=1, keepdims=True)
else:
self.mean_ = np.mean(gn, axis=1, keepdims=True)

# find scaling factor
p = self.mean_ / self.ploidy
self.std_ = np.sqrt(p * (1 - p))
if type(gn) is da.Array:
self.std_ = da.sqrt(p * (1 - p))
else:
self.std_ = np.sqrt(p * (1 - p))

return self

Expand Down
8 changes: 7 additions & 1 deletion allel/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import numpy as np

import dask.array as da

from allel.compat import string_types

Expand Down Expand Up @@ -49,7 +50,12 @@ def asarray_ndim(a, *ndims, **kwargs):
kwargs.setdefault('copy', False)
if a is None and allow_none:
return None
a = np.array(a, **kwargs)
if type(a) is da.Array:
# Remove copy kwarg if it exists (Dask does not support this parameter)
kwargs.pop('copy', False)
a = da.array(a, **kwargs)
else:
a = np.array(a, **kwargs)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would open an issue with Dask asking them to support the `copy` keyword in `da.array`, rather than silently dropping it here — callers that rely on copy semantics would get different behavior for Dask inputs.

if a.ndim not in ndims:
if len(ndims) > 1:
expect_str = 'one of %s' % str(ndims)
Expand Down