Minor updates and bug fixes.

shz9 · Dec 3, 2024 · 984919b · 984919b
1 parent 04cc6ac
commit 984919b
Show file tree

Hide file tree

Showing 9 changed files with 53 additions and 27 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -24,6 +24,7 @@ not work well for very large datasets with millions of variants and it causes ov
 - Updated `from_plink_table` method in `LDMatrix` to handle cases where boundaries are different from what 
 `plink` computes.
 - Fixed bug in `symmetrize_ut_csr_matrix` utility functions.
+- Changed the default storage data type for LD matrices from `int8` to `int16`.
 
 ### Added
 

diff --git a/bin/magenpy_ld b/bin/magenpy_ld
@@ -82,7 +82,7 @@ parser.add_argument('--metadata', dest='metadata', type=str,
 
 # Argument for the float precision:
 parser.add_argument('--storage-dtype', dest='storage_dtype', type=str,
-                    default='int8', help='The data type for the entries of the LD matrix.',
+                    default='int16', help='The data type for the entries of the LD matrix.',
                     choices={'float32', 'float64', 'int16', 'int8'})
 
 # Other options:

diff --git a/magenpy/GWADataLoader.py b/magenpy/GWADataLoader.py
@@ -635,7 +635,7 @@ def release_ld(self):
     def compute_ld(self,
                    estimator,
                    output_dir,
-                   dtype='int8',
+                   dtype='int16',
                    compressor_name='zstd',
                    compression_level=7,
                    compute_spectral_properties=False,

diff --git a/magenpy/GenotypeMatrix.py b/magenpy/GenotypeMatrix.py
@@ -343,7 +343,7 @@ def get_snp_attribute(self, attr):
     def compute_ld(self,
                    estimator,
                    output_dir,
-                   dtype='int8',
+                   dtype='int16',
                    compressor_name='zstd',
                    compression_level=7,
                    compute_spectral_properties=False,

diff --git a/magenpy/LDMatrix.py b/magenpy/LDMatrix.py
@@ -22,6 +22,8 @@ class LDMatrix(object):
     * Initialize an `LDMatrix` object from a Zarr array store.
     * Compute LD scores for each SNP in the LD matrix.
     * Filter the LD matrix based on SNP indices or ranges.
+    * Perform linear algebra operations on LD matrices, including SVD, extremal eigenvalue estimation,
+    and efficient matrix-vector multiplication.
 
     The Zarr hierarchy is structured as follows:
 
@@ -156,7 +158,7 @@ def from_csr(cls,
                  csr_mat,
                  store_path,
                  overwrite=False,
-                 dtype='int8',
+                 dtype='int16',
                  compressor_name='zstd',
                  compression_level=7):
         """
@@ -215,7 +217,7 @@ def from_plink_table(cls,
                          ld_boundaries=None,
                          pandas_chunksize=None,
                          overwrite=False,
-                         dtype='int8',
+                         dtype='int16',
                          compressor_name='zstd',
                          compression_level=7):
         """
@@ -332,7 +334,7 @@ def from_dense_zarr_matrix(cls,
                                store_path,
                                overwrite=False,
                                delete_original=False,
-                               dtype='int8',
+                               dtype='int16',
                                compressor_name='zstd',
                                compression_level=7):
         """
@@ -433,7 +435,7 @@ def from_ragged_zarr_matrix(cls,
                                 store_path,
                                 overwrite=False,
                                 delete_original=False,
-                                dtype='int8',
+                                dtype='int16',
                                 compressor_name='zstd',
                                 compression_level=7):
         """
@@ -1322,7 +1324,7 @@ def estimate_extremal_eigenvalues(self,
     def get_lambda_min(self, aggregate=None, min_max_ratio=0.):
         """
         A utility method to compute the `lambda_min` value for the LD matrix. `lambda_min` is the smallest
-        eigenvalue of the LD matrix and this quantity can be useful to know about in some applications.
+        algebraic eigenvalue of the LD matrix. This quantity is useful to know in some applications.
         The function retrieves minimum eigenvalue (if pre-computed and stored) per block and maps it
         to each variant in the corresponding block. If minimum eigenvalues per block are not available,
          we use global minimum eigenvalue (either from matrix attributes or we compute it on the spot).
@@ -1331,20 +1333,23 @@ def get_lambda_min(self, aggregate=None, min_max_ratio=0.):
 
         abs(min(lambda_min, 0.))
 
-        This implies that if the minimum eigenvalue is positive, we just return 0. for `lambda_min`. We are mainly
+        This implies that if the minimum eigenvalue is non-negative, we just return 0. for `lambda_min`. We are mainly
         interested in negative eigenvalues here (if they exist).
 
         :param aggregate: A summary of the minimum eigenvalue across variants or across blocks (if available).
-        Supported aggregation functions are `mean_block`, `median_block`, `min_block`, and `min`. If `min` is selected,
-        we return the minimum eigenvalue for the entire matrix (rather than sub-blocks of it).
+        Supported aggregation functions are `min_block` and `min`. If `min` is selected,
+        we return the minimum eigenvalue for the entire matrix (rather than sub-blocks of it). If `min_block` is
+        selected, we return the smallest of the per-block minimum eigenvalues (requires stored per-block eigenvalues).
+
         :param min_max_ratio: The ratio between the absolute values of the minimum and maximum eigenvalues.
         This could be used to target a particular threshold for the minimum eigenvalue.
 
-        :return: The `lambda_min` value for the LD matrix.
+        :return: The absolute value of the minimum eigenvalue for the LD matrix. If the minimum
+        eigenvalue is non-negative, we return zero.
         """
 
         if aggregate is not None:
-            assert aggregate in ('mean_block', 'median_block', 'min_block', 'min')
+            assert aggregate in ('min_block', 'min')
 
         # Get the attributes of the LD store:
         store_attrs = self.list_store_attributes()
@@ -1365,7 +1370,7 @@ def threshold_lambda_min(eigs):
 
             spectral_props = self.get_store_attr('Spectral properties')
 
-            if aggregate in ('mean_block', 'median_block', 'min_block'):
+            if aggregate == 'min_block':
                 assert 'Eigenvalues per block' in spectral_props, (
                     'Aggregating lambda_min across blocks '
                     'requires that these blocks are pre-defined.')
@@ -1406,10 +1411,6 @@ def threshold_lambda_min(eigs):
                     if self._mask is not None:
                         lambda_min = lambda_min[self._mask]
 
-                elif aggregate == 'mean_block':
-                    lambda_min = np.mean(block_eigs['min'])
-                elif aggregate == 'median_block':
-                    lambda_min = np.median(block_eigs['min'])
                 elif aggregate == 'min_block':
                     lambda_min = np.min(block_eigs['min'])
 
@@ -1433,6 +1434,30 @@ def estimate_uncompressed_size(self, dtype=None):
 
         return 2.*self._zg['matrix/data'].shape[0]*np.dtype(dtype).itemsize / 1024 ** 2
 
+    def get_total_stored_bytes(self):
+        """
+        Estimate the storage size for all elements of the `LDMatrix` hierarchy, including the LD data arrays,
+        metadata arrays, and attributes (approximated by the UTF-8 length of their string representation).
+
+        :return: The estimated size of the stored and compressed LDMatrix object in bytes.
+        """
+
+        total_bytes = 0
+
+        # Estimate contribution of matrix arrays
+        for arr_name, array in self.zarr_group.matrix.arrays():
+            total_bytes += array.nbytes_stored
+
+        # Estimate contribution of metadata arrays
+        for arr_name, array in self.zarr_group.metadata.arrays():
+            total_bytes += array.nbytes_stored
+
+        # Estimate the contribution of the attributes:
+        if hasattr(self.zarr_group, 'attrs'):
+            total_bytes += len(str(dict(self.zarr_group.attrs)).encode('utf-8'))
+
+        return total_bytes
+
     def get_metadata(self, key, apply_mask=True):
         """
         Get the metadata associated with each variant in the LD matrix.

diff --git a/magenpy/__init__.py b/magenpy/__init__.py
@@ -17,7 +17,7 @@
 from .utils.data_utils import *
 
 __version__ = '0.1.4'
-__release_date__ = 'October 2024'
+__release_date__ = 'December 2024'
 
 
 config = configparser.ConfigParser()

diff --git a/magenpy/stats/ld/estimator.py b/magenpy/stats/ld/estimator.py
@@ -142,7 +142,7 @@ def compute(self,
                 output_dir,
                 overwrite=True,
                 delete_original=True,
-                dtype='int8',
+                dtype='int16',
                 compressor_name='zstd',
                 compression_level=7,
                 compute_spectral_properties=False) -> LDMatrix:
@@ -341,7 +341,7 @@ def compute(self,
                 output_dir,
                 overwrite=True,
                 delete_original=True,
-                dtype='int8',
+                dtype='int16',
                 compressor_name='zstd',
                 compression_level=7,
                 compute_spectral_properties=False) -> LDMatrix:
@@ -425,7 +425,6 @@ def compute(self,
             n_snps_after = ld_mat.n_snps
 
             if n_snps_after < n_snps_before:
-
                 spectral_prop['Extremal (excluding LRLD)'] = ld_mat.estimate_extremal_eigenvalues()
 
             # Update or set the spectral properties attribute:
@@ -510,7 +509,7 @@ def compute(self,
                 output_dir,
                 overwrite=True,
                 delete_original=True,
-                dtype='int8',
+                dtype='int16',
                 compressor_name='zstd',
                 compression_level=7,
                 compute_spectral_properties=False,
@@ -694,7 +693,7 @@ def compute(self,
                 output_dir,
                 overwrite=True,
                 delete_original=True,
-                dtype='int8',
+                dtype='int16',
                 compressor_name='zstd',
                 compression_level=7,
                 compute_spectral_properties=False) -> LDMatrix:

diff --git a/magenpy/stats/ld/utils.py b/magenpy/stats/ld/utils.py
@@ -400,7 +400,7 @@ def harmonic_series_sum(n):
     return ld_mat_obj
 
 
-def estimate_rows_per_chunk(rows, cols, dtype='int8', mem_size=128):
+def estimate_rows_per_chunk(rows, cols, dtype='int16', mem_size=128):
     """
     Estimate the number of rows per chunk for matrices conditional on the desired size of the chunk in MB.
     The estimator takes as input the number of rows, columns, data type, and projected size of the chunk in memory.
@@ -426,7 +426,7 @@ def compute_ld_plink1p9(genotype_matrix,
                         trim_boundaries=False,
                         temp_dir='temp',
                         overwrite=True,
-                        dtype='int8',
+                        dtype='int16',
                         compressor_name='zstd',
                         compression_level=7):
 
@@ -539,7 +539,7 @@ def compute_ld_xarray(genotype_matrix,
                       temp_dir='temp',
                       overwrite=True,
                       delete_original=True,
-                      dtype='int8',
+                      dtype='int16',
                       compressor_name='zstd',
                       compression_level=7):
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,6 @@
 dask<=2024.1.0  # Seen installation issues with newer versions
 scipy
+xarray<=2024.7.0
 numpy<2
 pandas
 pandas-plink