Skip to content

Commit

Permalink
Fixed bug in how LD window thresholds that are passed to plink are co…
Browse files Browse the repository at this point in the history
…mputed/processed.
  • Loading branch information
shz9 committed Jun 7, 2024
1 parent 8a47ae1 commit 227d08c
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 26 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
not work well for very large datasets with millions of variants and it causes overflow errors.
- Updated the way we determine the `pandas` chunksize when converting from `plink` tables to `zarr`.
- Simplified the way we compute the quantization scale in `model_utils`.
- Fixed major bug in how LD window thresholds that are passed to `plink1.9` are computed.

### Added

Expand Down
5 changes: 5 additions & 0 deletions magenpy/stats/ld/c_utils.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,11 @@ cpdef find_windowed_ld_boundaries(floating[:] pos, double max_dist):
Find the LD boundaries for the windowed estimator of LD, i.e., the
indices of the leftmost and rightmost neighbors for each SNP.
.. note::
To match plink's behavior, the bounds here are inclusive, i.e.,
if the distance between two SNPs is exactly equal to the maximum distance,
they are considered neighbors.
:param pos: A vector with the position of each genetic variant.
:param max_dist: The maximum distance between SNPs to consider them neighbors.
"""
Expand Down
59 changes: 59 additions & 0 deletions magenpy/stats/ld/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,60 @@ def compute_ld_boundaries(self):
m = self.genotype_matrix.n_snps
return np.array((np.zeros(m), np.ones(m)*m)).astype(np.int64)

def compute_plink_window_thresholds(self, ld_boundaries=None):
"""
Computes the LD window thresholds to pass to plink1.9 for computing LD matrices.
Unfortunately, plink1.9 sets some default values for the window size and it
is important to set all the thresholds to obtain the desired shape for the
LD matrix.
:param ld_boundaries: The LD boundaries for which to compute the thresholds. If not passed,
we compute the LD boundaries using the `compute_ld_boundaries` method.
:return: A dictionary containing the window size thresholds for plink1.9.
"""

if ld_boundaries is None:
ld_boundaries = self.compute_ld_boundaries()

threshold_dict = {}

# (1) Determine maximum window size (Maximum number of neighbors on each side):
try:
threshold_dict['window_size'] = getattr(self, "window_size")
assert threshold_dict['window_size'] is not None
except (AttributeError, AssertionError):
threshold_dict['window_size'] = np.abs(ld_boundaries -
np.arange(ld_boundaries.shape[1])).max()

# (2) Determine the maximum window size in kilobases + Centi Morgan (if available):

positional_bounds = np.array([ld_boundaries[0, :], ld_boundaries[1, :] - 1])

try:
threshold_dict['kb_window_size'] = getattr(self, "kb_window_size")
assert threshold_dict['kb_window_size'] is not None
except (AttributeError, AssertionError):
kb_pos = .001 * self.genotype_matrix.bp_pos
kb_bounds = kb_pos[positional_bounds]
threshold_dict['kb_window_size'] = np.abs(kb_bounds - kb_pos).max()

# (3) centi Morgan:
try:
threshold_dict['cm_window_size'] = getattr(self, "cm_window_size")
assert threshold_dict['cm_window_size'] is not None
except (AttributeError, AssertionError):
try:
# Checks if cm_pos is available in the genotype matrix:
cm_pos = self.genotype_matrix.cm_pos
cm_bounds = self.genotype_matrix.cm_pos[positional_bounds]
threshold_dict['cm_window_size'] = np.abs(cm_bounds - cm_pos).max()
except KeyError:
del threshold_dict['cm_window_size']

return threshold_dict

def compute(self,
output_dir,
temp_dir='temp',
Expand Down Expand Up @@ -100,9 +154,14 @@ def compute(self,
compressor_name=compressor_name,
compression_level=compression_level)
elif isinstance(self.genotype_matrix, plinkBEDGenotypeMatrix):

# Compute the window size thresholds to pass to plink 1.9:
window_size_thersh = self.compute_plink_window_thresholds(ld_boundaries)

ld_mat = compute_ld_plink1p9(self.genotype_matrix,
ld_boundaries,
output_dir,
window_size_thersh,
temp_dir=temp_dir,
overwrite=overwrite,
dtype=dtype,
Expand Down
33 changes: 7 additions & 26 deletions magenpy/stats/ld/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,7 @@ def estimate_rows_per_chunk(rows, cols, dtype='int8', mem_size=128):
def compute_ld_plink1p9(genotype_matrix,
ld_boundaries,
output_dir,
window_size_thresholds,
temp_dir='temp',
overwrite=True,
dtype='int8',
Expand All @@ -263,8 +264,9 @@ def compute_ld_plink1p9(genotype_matrix,
Compute LD matrices using plink 1.9.
:param genotype_matrix: A plinkBEDGenotypeMatrix object
:param ld_boundaries: An array of LD boundaries for every SNP
:param ld_boundaries: An 2xM matrix of LD boundaries for every variant.
:param output_dir: The output directory for the final LD matrix file (after processing).
:param window_size_thresholds: A dictionary of the window size thresholds to pass to plink1.9.
:param temp_dir: A temporary directory to store intermediate files (e.g. files created for and by plink).
:param overwrite: If True, it overwrites any LD matrices in `output_dir`.
:param dtype: The data type for the entries of the LD matrix (supported data types are float32, float64
Expand All @@ -291,40 +293,19 @@ def compute_ld_plink1p9(genotype_matrix,

plink_output = osp.join(temp_dir, f'chr_{str(genotype_matrix.chromosome)}')

# Set the window sizes in various units:

# (1) Number of neighboring SNPs:
window_size = (ld_boundaries - np.arange(genotype_matrix.m)).max() + 10

# (2) Kilobases:
positional_bounds = np.clip(np.array([ld_boundaries[0, :] - 1, ld_boundaries[1, :]]),
a_min=0, a_max=ld_boundaries.shape[1] - 1)

kb_pos = .001*genotype_matrix.bp_pos
kb_bounds = kb_pos[positional_bounds]
kb_window_size = (kb_bounds - kb_pos).max() + .01

# (3) centi Morgan:
try:
cm_pos = genotype_matrix.cm_pos
cm_bounds = genotype_matrix.cm_pos[positional_bounds]
cm_window_size = (cm_bounds - cm_pos).max() + .01
except Exception:
cm_window_size = None

cmd = [
f"--bfile {genotype_matrix.bed_file.replace('.bed', '')}",
f"--keep {keep_file}",
f"--extract {snp_keepfile}",
"--keep-allele-order",
f"--out {plink_output}",
"--r gz",
f"--ld-window {window_size}",
f"--ld-window-kb {kb_window_size}"
f"--ld-window {window_size_thresholds['window_size']}",
f"--ld-window-kb {window_size_thresholds['kb_window_size']}"
]

if cm_window_size is not None:
cmd.append(f"--ld-window-cm {cm_window_size}")
if 'cm_window_size' in window_size_thresholds:
cmd.append(f"--ld-window-cm {window_size_thresholds['cm_window_size']}")

# ---------------------------------------------------------
# Test if plink1.9 version is compatible with setting the --ld-window-r2 flag:
Expand Down

0 comments on commit 227d08c

Please sign in to comment.