Skip to content

Commit

Permalink
Minor updates and bug fixes.
Browse files Browse the repository at this point in the history
  • Loading branch information
shz9 committed Dec 3, 2024
1 parent 04cc6ac commit 984919b
Show file tree
Hide file tree
Showing 9 changed files with 53 additions and 27 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ not work well for very large datasets with millions of variants and it causes ov
- Updated `from_plink_table` method in `LDMatrix` to handle cases where boundaries are different from what
`plink` computes.
- Fixed a bug in the `symmetrize_ut_csr_matrix` utility function.
- Changed default storage data type for LD matrices to `int16`.

### Added

Expand Down
2 changes: 1 addition & 1 deletion bin/magenpy_ld
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ parser.add_argument('--metadata', dest='metadata', type=str,

# Argument for the storage data type of the LD matrix entries:
parser.add_argument('--storage-dtype', dest='storage_dtype', type=str,
default='int8', help='The data type for the entries of the LD matrix.',
default='int16', help='The data type for the entries of the LD matrix.',
choices={'float32', 'float64', 'int16', 'int8'})

# Other options:
Expand Down
2 changes: 1 addition & 1 deletion magenpy/GWADataLoader.py
Original file line number Diff line number Diff line change
Expand Up @@ -635,7 +635,7 @@ def release_ld(self):
def compute_ld(self,
estimator,
output_dir,
dtype='int8',
dtype='int16',
compressor_name='zstd',
compression_level=7,
compute_spectral_properties=False,
Expand Down
2 changes: 1 addition & 1 deletion magenpy/GenotypeMatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ def get_snp_attribute(self, attr):
def compute_ld(self,
estimator,
output_dir,
dtype='int8',
dtype='int16',
compressor_name='zstd',
compression_level=7,
compute_spectral_properties=False,
Expand Down
55 changes: 40 additions & 15 deletions magenpy/LDMatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ class LDMatrix(object):
* Initialize an `LDMatrix` object from a Zarr array store.
* Compute LD scores for each SNP in the LD matrix.
* Filter the LD matrix based on SNP indices or ranges.
* Perform linear algebra operations on LD matrices, including SVD, estimating extremal eigenvalues,
and efficient matrix-vector multiplication.
The Zarr hierarchy is structured as follows:
Expand Down Expand Up @@ -156,7 +158,7 @@ def from_csr(cls,
csr_mat,
store_path,
overwrite=False,
dtype='int8',
dtype='int16',
compressor_name='zstd',
compression_level=7):
"""
Expand Down Expand Up @@ -215,7 +217,7 @@ def from_plink_table(cls,
ld_boundaries=None,
pandas_chunksize=None,
overwrite=False,
dtype='int8',
dtype='int16',
compressor_name='zstd',
compression_level=7):
"""
Expand Down Expand Up @@ -332,7 +334,7 @@ def from_dense_zarr_matrix(cls,
store_path,
overwrite=False,
delete_original=False,
dtype='int8',
dtype='int16',
compressor_name='zstd',
compression_level=7):
"""
Expand Down Expand Up @@ -433,7 +435,7 @@ def from_ragged_zarr_matrix(cls,
store_path,
overwrite=False,
delete_original=False,
dtype='int8',
dtype='int16',
compressor_name='zstd',
compression_level=7):
"""
Expand Down Expand Up @@ -1322,7 +1324,7 @@ def estimate_extremal_eigenvalues(self,
def get_lambda_min(self, aggregate=None, min_max_ratio=0.):
"""
A utility method to compute the `lambda_min` value for the LD matrix. `lambda_min` is the smallest
eigenvalue of the LD matrix and this quantity can be useful to know about in some applications.
algebraic eigenvalue of the LD matrix. This quantity is useful to know in some applications.
The function retrieves the minimum eigenvalue (if pre-computed and stored) per block and maps it
to each variant in the corresponding block. If minimum eigenvalues per block are not available,
we use the global minimum eigenvalue (either from matrix attributes or we compute it on the spot).
Expand All @@ -1331,20 +1333,23 @@ def get_lambda_min(self, aggregate=None, min_max_ratio=0.):
abs(min(lambda_min, 0.))
This implies that if the minimum eigenvalue is positive, we just return 0. for `lambda_min`. We are mainly
This implies that if the minimum eigenvalue is non-negative, we just return 0. for `lambda_min`. We are mainly
interested in negative eigenvalues here (if they exist).
:param aggregate: A summary of the minimum eigenvalue across variants or across blocks (if available).
Supported aggregation functions are `mean_block`, `median_block`, `min_block`, and `min`. If `min` is selected,
we return the minimum eigenvalue for the entire matrix (rather than sub-blocks of it).
Supported aggregation functions are `min_block` and `min`. If `min` is selected,
we return the minimum eigenvalue for the entire matrix (rather than sub-blocks of it). If `min_block` is
selected, we return the minimum eigenvalue for each block separately (mapped to variants within that block).
:param min_max_ratio: The ratio between the absolute values of the minimum and maximum eigenvalues.
This could be used to target a particular threshold for the minimum eigenvalue.
:return: The `lambda_min` value for the LD matrix.
:return: The absolute value of the minimum eigenvalue for the LD matrix. If the minimum
eigenvalue is non-negative, we return zero.
"""

if aggregate is not None:
assert aggregate in ('mean_block', 'median_block', 'min_block', 'min')
assert aggregate in ('min_block', 'min')

# Get the attributes of the LD store:
store_attrs = self.list_store_attributes()
Expand All @@ -1365,7 +1370,7 @@ def threshold_lambda_min(eigs):

spectral_props = self.get_store_attr('Spectral properties')

if aggregate in ('mean_block', 'median_block', 'min_block'):
if aggregate == 'min_block':
assert 'Eigenvalues per block' in spectral_props, (
'Aggregating lambda_min across blocks '
'requires that these blocks are pre-defined.')
Expand Down Expand Up @@ -1406,10 +1411,6 @@ def threshold_lambda_min(eigs):
if self._mask is not None:
lambda_min = lambda_min[self._mask]

elif aggregate == 'mean_block':
lambda_min = np.mean(block_eigs['min'])
elif aggregate == 'median_block':
lambda_min = np.median(block_eigs['min'])
elif aggregate == 'min_block':
lambda_min = np.min(block_eigs['min'])

Expand All @@ -1433,6 +1434,30 @@ def estimate_uncompressed_size(self, dtype=None):

return 2.*self._zg['matrix/data'].shape[0]*np.dtype(dtype).itemsize / 1024 ** 2

def get_total_stored_bytes(self):
    """
    Estimate how many bytes the `LDMatrix` hierarchy occupies in storage.

    The estimate adds up the `nbytes_stored` value of every array found under
    the `matrix` and `metadata` groups. If the root group exposes an `attrs`
    member, the length of the UTF-8 encoded string representation of those
    attributes is added as well.

    :return: The estimated size of the stored and compressed LDMatrix object in bytes.
    """

    root = self.zarr_group
    stored = 0

    # Arrays holding the LD data come first, followed by the metadata arrays.
    for group_name in ('matrix', 'metadata'):
        subgroup = getattr(root, group_name)
        stored += sum(arr.nbytes_stored for _, arr in subgroup.arrays())

    # Without attributes, there is nothing further to account for.
    if not hasattr(root, 'attrs'):
        return stored

    # Approximate the attribute footprint by the size of their encoded text form.
    attrs_as_text = str(dict(root.attrs))
    return stored + len(attrs_as_text.encode('utf-8'))

def get_metadata(self, key, apply_mask=True):
"""
Get the metadata associated with each variant in the LD matrix.
Expand Down
2 changes: 1 addition & 1 deletion magenpy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from .utils.data_utils import *

__version__ = '0.1.4'
__release_date__ = 'October 2024'
__release_date__ = 'December 2024'


config = configparser.ConfigParser()
Expand Down
9 changes: 4 additions & 5 deletions magenpy/stats/ld/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def compute(self,
output_dir,
overwrite=True,
delete_original=True,
dtype='int8',
dtype='int16',
compressor_name='zstd',
compression_level=7,
compute_spectral_properties=False) -> LDMatrix:
Expand Down Expand Up @@ -341,7 +341,7 @@ def compute(self,
output_dir,
overwrite=True,
delete_original=True,
dtype='int8',
dtype='int16',
compressor_name='zstd',
compression_level=7,
compute_spectral_properties=False) -> LDMatrix:
Expand Down Expand Up @@ -425,7 +425,6 @@ def compute(self,
n_snps_after = ld_mat.n_snps

if n_snps_after < n_snps_before:

spectral_prop['Extremal (excluding LRLD)'] = ld_mat.estimate_extremal_eigenvalues()

# Update or set the spectral properties attribute:
Expand Down Expand Up @@ -510,7 +509,7 @@ def compute(self,
output_dir,
overwrite=True,
delete_original=True,
dtype='int8',
dtype='int16',
compressor_name='zstd',
compression_level=7,
compute_spectral_properties=False,
Expand Down Expand Up @@ -694,7 +693,7 @@ def compute(self,
output_dir,
overwrite=True,
delete_original=True,
dtype='int8',
dtype='int16',
compressor_name='zstd',
compression_level=7,
compute_spectral_properties=False) -> LDMatrix:
Expand Down
6 changes: 3 additions & 3 deletions magenpy/stats/ld/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ def harmonic_series_sum(n):
return ld_mat_obj


def estimate_rows_per_chunk(rows, cols, dtype='int8', mem_size=128):
def estimate_rows_per_chunk(rows, cols, dtype='int16', mem_size=128):
"""
Estimate the number of rows per chunk for matrices conditional on the desired size of the chunk in MB.
The estimator takes as input the number of rows, columns, data type, and projected size of the chunk in memory.
Expand All @@ -426,7 +426,7 @@ def compute_ld_plink1p9(genotype_matrix,
trim_boundaries=False,
temp_dir='temp',
overwrite=True,
dtype='int8',
dtype='int16',
compressor_name='zstd',
compression_level=7):

Expand Down Expand Up @@ -539,7 +539,7 @@ def compute_ld_xarray(genotype_matrix,
temp_dir='temp',
overwrite=True,
delete_original=True,
dtype='int8',
dtype='int16',
compressor_name='zstd',
compression_level=7):

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
dask<=2024.1.0 # Seen installation issues with newer versions
scipy
xarray<=2024.7.0
numpy<2
pandas
pandas-plink
Expand Down

0 comments on commit 984919b

Please sign in to comment.