Skip to content

Commit

Permalink
Release 0.1.1. Minor updates / bug fixes.
Browse files Browse the repository at this point in the history
  • Loading branch information
shz9 committed Apr 12, 2024
1 parent 9aee88c commit e868877
Show file tree
Hide file tree
Showing 8 changed files with 89 additions and 34 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.1.1] - 2024-04-12

### Changed

- Fixed bugs in how covariates are processed in `SampleTable`.
- Fixed bugs / issues in implementation of GWAS with `xarray` backend.
- Streamlined implementation of `manhattan` plotting function.

## [0.1.0] - 2024-04-04

A large scale restructuring of the code base to improve efficiency and usability.
Expand Down
7 changes: 7 additions & 0 deletions magenpy/GenotypeMatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,13 @@ def filter_samples(self, keep_samples=None, keep_file=None):

self.sample_table.filter_samples(keep_samples=keep_samples, keep_file=keep_file)

# IMPORTANT: After filtering samples, update SNP attributes that depend on the
# samples, such as MAF and N:
if 'N' in self.snp_table:
self.compute_sample_size_per_snp()
if 'MAF' in self.snp_table:
self.compute_allele_frequency()

def score(self, beta, standardize_genotype=False):
"""
Perform linear scoring, i.e. multiply the genotype matrix by the vector of effect sizes, `beta`.
Expand Down
6 changes: 3 additions & 3 deletions magenpy/SampleTable.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def read_covariates_file(self, covar_file, **read_csv_kwargs):
covar_table['FID'] = covar_table['FID'].astype(type(self.fid[0]))
covar_table['IID'] = covar_table['IID'].astype(type(self.iid[0]))

self.table = self.table.merge(covar_table)
self.table = self.table.merge(covar_table, on=['FID', 'IID'])
else:
self.table = covar_table

Expand Down Expand Up @@ -317,7 +317,7 @@ def get_covariates_table(self, covar_subset=None):
assert self._covariate_cols is not None

if covar_subset is None:
covar = self._covariate_cols
covar = list(self._covariate_cols)
else:
covar = list(set(self._covariate_cols).intersection(set(covar_subset)))

Expand All @@ -332,7 +332,7 @@ def get_covariates(self, covar_subset=None):
:return: A numpy array with the covariate values.
"""
return self.get_covariates_table(covar_subset=covar_subset).iloc[:, 2:].values
return self.get_covariates_table(covar_subset=covar_subset).drop(['FID', 'IID'], axis=1).values

def set_phenotype(self, phenotype, phenotype_likelihood=None):
"""
Expand Down
2 changes: 1 addition & 1 deletion magenpy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from .utils.data_utils import *

__version__ = '0.1.0'
__version__ = '0.1.1'
__release_date__ = 'April 2024'


Expand Down
43 changes: 30 additions & 13 deletions magenpy/plot/gwa.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,30 +8,30 @@
def manhattan(input_data: Union[GWADataLoader, SumstatsTable],
y=None,
y_label=None,
scatter_kwargs=None,
#highlight_snps=None,
#highlight_snps_kwargs=None,
chrom_sep_color='#f0f0f0',
snp_color='#808080',
snp_marker='o',
snp_alpha=0.3,
add_bonf_line=True,
bonf_line_color='#b06a7a'):
bonf_line_kwargs=None):

"""
Generate Manhattan plot where the x-axis is the genomic position (in BP)
and the y-axis is the -log10(p-value) or some other statistic of the user's choice.
TODO: Add functionality to highlight certain SNPs or markers on the plot.
TODO: Allow the user to plot other statistics on the y-axis.
:param input_data: An instance of `SumstatsTable` or `GWADataLoader` from which data about the
positions of the SNPs will be extracted.
:param y: An optional vector of values to plot on the y-axis. If not provided, the -log10(p-value)
will be plotted by default.
:param y_label: A label for the quantity or statistic that will be plotted on the y-axis.
:param chrom_sep_color: The color for the chromosome separator block.
:param snp_color: The color of the dots on the Manhattan plot.
:param snp_marker: The shape of the marker on the Manhattan plot.
:param snp_alpha: The opacity level for the markers.
:param scatter_kwargs: A dictionary of keyword arguments to pass to the `plt.scatter` function.
This can be used to customize the appearance of the points on the scatter plot.
:param add_bonf_line: If True, add a line indicating the Bonferroni significance threshold.
:param bonf_line_color: The color of the Bonferroni significance threshold line.
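:param bonf_line_kwargs: A dictionary of keyword arguments to pass to the `plt.axhline` function.
This can be used to customize the appearance of the Bonferroni significance threshold line.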
"""

Expand All @@ -42,6 +42,23 @@ def manhattan(input_data: Union[GWADataLoader, SumstatsTable],
else:
raise ValueError("The input data must be an instance of `SumstatsTable` or `GWADataLoader`.")

# -------------------------------------------------------
# Add custom scatter plot arguments (if not provided)
if scatter_kwargs is None:
scatter_kwargs = {'marker': '.', 'alpha': 0.3, 'color': '#808080'}
else:
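# Merge the user-provided arguments with the default scatter plot arguments.
# NOTE(review): the defaults are unpacked last, so they take precedence over any
# user-provided value for the same key -- confirm this is the intended order.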
scatter_kwargs = {**scatter_kwargs, **{'marker': '.', 'alpha': 0.3, 'color': '#808080'}}

# Add custom Bonferroni line arguments (if not provided)
if bonf_line_kwargs is None:
bonf_line_kwargs = {'color': '#b06a7a', 'ls': '--', 'zorder': 1}
else:
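# Merge the user-provided arguments with the default Bonferroni line arguments.
# NOTE(review): the defaults are unpacked last, so they take precedence over any
# user-provided value for the same key -- confirm this is the intended order.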
bonf_line_kwargs = {**bonf_line_kwargs, **{'color': '#b06a7a', 'ls': '--', 'zorder': 1}}

# -------------------------------------------------------

starting_pos = 0
ticks = []
chrom_spacing = .1*min([p.max() - p.min() for c, p in pos.items()])
Expand All @@ -52,8 +69,8 @@ def manhattan(input_data: Union[GWADataLoader, SumstatsTable],
# with -log10(p_value) on the Y-axis.

if add_bonf_line:
# Add bonferroni significance threshold line:
plt.axhline(-np.log10(0.05 / 1e6), ls='--', zorder=1, color=bonf_line_color)
# Add Bonferroni significance threshold line:
plt.axhline(-np.log10(0.05 / 1e6), bonf_line_kwargs)

if isinstance(input_data, SumstatsTable):
y = {c: ss.log10_p_value for c, ss in input_data.split_by_chromosome().items()}
Expand All @@ -76,9 +93,9 @@ def manhattan(input_data: Union[GWADataLoader, SumstatsTable],

ticks.append((xmin + xmax) / 2)

plt.scatter(pos[c] + starting_pos, y[c],
c=snp_color, alpha=snp_alpha, label=None,
marker=snp_marker)
plt.scatter(pos[c] + starting_pos,
y[c],
scatter_kwargs)

#if hl_snps is not None:
# plt.scatter((pos + starting_pos)[hl_snps[c]], y[c][hl_snps[c]],
Expand Down
54 changes: 38 additions & 16 deletions magenpy/stats/gwa/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,12 @@ def perform_gwa_plink2(genotype_matrix,
warnings.warn("The phenotype likelihood is not specified! "
"Assuming that the phenotype is continuous...")

# It can happen sometimes that some interfaces call this function
# with the `standardize_genotype` flag set to True. We remove this
# flag from the phenotype transformation kwargs to avoid errors:
if 'standardize_genotype' in phenotype_transform_kwargs:
del phenotype_transform_kwargs['standardize_genotype']

# Transform the phenotype:
phenotype, mask = chained_transform(s_table, **phenotype_transform_kwargs)

Expand Down Expand Up @@ -282,39 +288,55 @@ def perform_gwa_xarray(genotype_matrix,

# Get the SNP table from the genotype_matrix object:
sumstats_table = genotype_matrix.get_snp_table(
['CHR', 'SNP', 'POS', 'A1', 'A2', 'N', 'MAF']
['CHR', 'SNP', 'POS', 'A1', 'A2']
)

# -----------------------------------------------------------

# Transform the phenotype:
phenotype, mask = chained_transform(genotype_matrix.sample_table, **phenotype_transform_kwargs)

# TODO: Figure out how to adjust the per-variant sample size based on the mask!

# Estimate the phenotypic variance:
sigma_sq_y = np.var(phenotype)

# -----------------------------------------------------------
# Perform association testing using closed-form solutions:
# Prepare the genotype data for association testing:

# Apply the mask to the genotype matrix:
xr_mat = genotype_matrix.xr_mat[mask, :]

if standardize_genotype:
# Compute sample size per SNP:
n_per_snp = xr_mat.shape[0] - xr_mat.isnull().sum(axis=0).compute().values

from ..transforms.genotype import standardize
# Compute minor allele frequency per SNP:
maf = xr_mat.sum(axis=0).compute().values / (2 * n_per_snp)

sumstats_table['BETA'] = np.dot(standardize(xr_mat).T, phenotype) / sumstats_table['N'].values
sumstats_table['SE'] = np.sqrt(sigma_sq_y / sumstats_table['N'].values)
# Standardize or center the genotype matrix (account for missing values):
if standardize_genotype:
from ..transforms.genotype import standardize
xr_mat = standardize(xr_mat, fill_na=True)
else:
xr_mat = (xr_mat - 2.*maf).fillna(0.)

# Compute the sum of squares per SNP:
sum_x_sq = (xr_mat**2).sum(axis=0).compute().values

sumstats_table['BETA'] = (
np.dot(xr_mat.fillna(sumstats_table['MAF'].values).T, phenotype) /
sumstats_table['N'].values * genotype_matrix.maf_var
)
# -----------------------------------------------------------
# Compute quantities for association testing:

slope = np.dot(xr_mat.T, phenotype - phenotype.mean()) / sum_x_sq
intercept = phenotype.mean()

y_hat = xr_mat*slope + intercept

s2 = ((phenotype.reshape(-1, 1) - y_hat)**2).sum(axis=0) / (n_per_snp - 2)

se = np.sqrt(s2 / sum_x_sq)

# -----------------------------------------------------------
# Populate the data in the summary statistics table:

sumstats_table['SE'] = np.sqrt(sigma_sq_y / (sumstats_table['N'].values * genotype_matrix.maf_var))
sumstats_table['MAF'] = maf
sumstats_table['N'] = n_per_snp
sumstats_table['BETA'] = slope
sumstats_table['SE'] = se

ss_table = SumstatsTable(sumstats_table)
# Trigger computing z-score and p-values from the BETA and SE columns:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def no_cythonize(extensions, **_ignore):

setup(
name="magenpy",
version="0.1.0",
version="0.1.1",
author="Shadi Zabad",
author_email="shadi.zabad@mail.mcgill.ca",
description="Modeling and Analysis of Statistical Genetics data in python",
Expand Down
1 change: 1 addition & 0 deletions tests/conda_manual_testing.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ do
# Run pytest
python -m pytest -v

# Check the installed scripts
magenpy_ld -h
magenpy_simulate -h

Expand Down

0 comments on commit e868877

Please sign in to comment.