Release 0.1.1. Minor updates / bug fixes.

shz9 · Apr 12, 2024 · e868877 · e868877
1 parent 9aee88c
commit e868877
Show file tree

Hide file tree

Showing 8 changed files with 89 additions and 34 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.1.1] - 2024-04-12
+
+### Changed
+
+- Fixed bugs in how covariates are processed in `SampleTable`.
+- Fixed bugs / issues in implementation of GWAS with `xarray` backend.
+- Streamlined implementation of `manhattan` plotting function.
+
 ## [0.1.0] - 2024-04-04
 
 A large scale restructuring of the code base to improve efficiency and usability.

diff --git a/magenpy/GenotypeMatrix.py b/magenpy/GenotypeMatrix.py
@@ -475,6 +475,13 @@ def filter_samples(self, keep_samples=None, keep_file=None):
 
         self.sample_table.filter_samples(keep_samples=keep_samples, keep_file=keep_file)
 
+        # IMPORTANT: After filtering samples, update SNP attributes that depend on the
+        # samples, such as MAF and N:
+        if 'N' in self.snp_table:
+            self.compute_sample_size_per_snp()
+        if 'MAF' in self.snp_table:
+            self.compute_allele_frequency()
+
     def score(self, beta, standardize_genotype=False):
         """
         Perform linear scoring, i.e. multiply the genotype matrix by the vector of effect sizes, `beta`.

diff --git a/magenpy/SampleTable.py b/magenpy/SampleTable.py
@@ -223,7 +223,7 @@ def read_covariates_file(self, covar_file, **read_csv_kwargs):
             covar_table['FID'] = covar_table['FID'].astype(type(self.fid[0]))
             covar_table['IID'] = covar_table['IID'].astype(type(self.iid[0]))
 
-            self.table = self.table.merge(covar_table)
+            self.table = self.table.merge(covar_table, on=['FID', 'IID'])
         else:
             self.table = covar_table
 
@@ -317,7 +317,7 @@ def get_covariates_table(self, covar_subset=None):
         assert self._covariate_cols is not None
 
         if covar_subset is None:
-            covar = self._covariate_cols
+            covar = list(self._covariate_cols)
         else:
             covar = list(set(self._covariate_cols).intersection(set(covar_subset)))
 
@@ -332,7 +332,7 @@ def get_covariates(self, covar_subset=None):
 
         :return: A numpy array with the covariate values.
         """
-        return self.get_covariates_table(covar_subset=covar_subset).iloc[:, 2:].values
+        return self.get_covariates_table(covar_subset=covar_subset).drop(['FID', 'IID'], axis=1).values
 
     def set_phenotype(self, phenotype, phenotype_likelihood=None):
         """

diff --git a/magenpy/__init__.py b/magenpy/__init__.py
@@ -16,7 +16,7 @@
 
 from .utils.data_utils import *
 
-__version__ = '0.1.0'
+__version__ = '0.1.1'
 __release_date__ = 'April 2024'
 
 

diff --git a/magenpy/plot/gwa.py b/magenpy/plot/gwa.py
@@ -8,30 +8,30 @@
 def manhattan(input_data: Union[GWADataLoader, SumstatsTable],
               y=None,
               y_label=None,
+              scatter_kwargs=None,
+              #highlight_snps=None,
+              #highlight_snps_kwargs=None,
               chrom_sep_color='#f0f0f0',
-              snp_color='#808080',
-              snp_marker='o',
-              snp_alpha=0.3,
               add_bonf_line=True,
-              bonf_line_color='#b06a7a'):
+              bonf_line_kwargs=None):
 
     """
     Generate Manhattan plot where the x-axis is the genomic position (in BP)
     and the y-axis is the -log10(p-value) or some other statistic of the user's choice.
 
     TODO: Add functionality to highlight certain SNPs or markers on the plot.
+    TODO: Allow the user to plot other statistics on the y-axis.
 
     :param input_data: An instance of `SumstatsTable` or `GWADataLoader` from which data about the
     positions of the SNPs will be extracted.
     :param y: An optional vector of values to plot on the y-axis. If not provided, the -log10(p-value)
     will be plotted by default.
     :param y_label: A label for the quantity or statistic that will be plotted on the y-axis.
     :param chrom_sep_color: The color for the chromosome separator block.
-    :param snp_color: The color of the dots on the Manhattan plot.
-    :param snp_marker: The shape of the marker on the Manhattan plot.
-    :param snp_alpha: The opacity level for the markers.
+    :param scatter_kwargs: A dictionary of keyword arguments to pass to the `plt.scatter` function.
+    This can be used to customize the appearance of the points on the scatter plot.
     :param add_bonf_line: If True, add a line indicating the Bonferroni significance threshold.
-    :param bonf_line_color: The color of the Bonferroni significance threshold line.
+    :param bonf_line_kwargs: A dictionary of keyword arguments for `plt.axhline` (the Bonferroni threshold line).
 
     """
 
@@ -42,6 +42,23 @@ def manhattan(input_data: Union[GWADataLoader, SumstatsTable],
     else:
         raise ValueError("The input data must be an instance of `SumstatsTable` or `GWADataLoader`.")
 
+    # -------------------------------------------------------
+    # Set the default scatter plot arguments (if none were provided)
+    if scatter_kwargs is None:
+        scatter_kwargs = {'marker': '.', 'alpha': 0.3, 'color': '#808080'}
+    else:
+        # Intent: fill in only missing keys. NOTE(review): defaults are unpacked last, so they win:
+        scatter_kwargs = {**scatter_kwargs, **{'marker': '.', 'alpha': 0.3, 'color': '#808080'}}
+
+    # Set the default Bonferroni line arguments (if none were provided)
+    if bonf_line_kwargs is None:
+        bonf_line_kwargs = {'color': '#b06a7a', 'ls': '--', 'zorder': 1}
+    else:
+        # Intent: fill in only missing keys. NOTE(review): defaults are unpacked last, so they win:
+        bonf_line_kwargs = {**bonf_line_kwargs, **{'color': '#b06a7a', 'ls': '--', 'zorder': 1}}
+
+    # -------------------------------------------------------
+
     starting_pos = 0
     ticks = []
     chrom_spacing = .1*min([p.max() - p.min() for c, p in pos.items()])
@@ -52,8 +69,8 @@ def manhattan(input_data: Union[GWADataLoader, SumstatsTable],
         # with -log10(p_value) on the Y-axis.
 
         if add_bonf_line:
-            # Add bonferroni significance threshold line:
-            plt.axhline(-np.log10(0.05 / 1e6), ls='--', zorder=1, color=bonf_line_color)
+            # Add Bonferroni significance threshold line:
+            plt.axhline(-np.log10(0.05 / 1e6), bonf_line_kwargs)
 
         if isinstance(input_data, SumstatsTable):
             y = {c: ss.log10_p_value for c, ss in input_data.split_by_chromosome().items()}
@@ -76,9 +93,9 @@ def manhattan(input_data: Union[GWADataLoader, SumstatsTable],
 
         ticks.append((xmin + xmax) / 2)
 
-        plt.scatter(pos[c] + starting_pos, y[c],
-                    c=snp_color, alpha=snp_alpha, label=None,
-                    marker=snp_marker)
+        plt.scatter(pos[c] + starting_pos,
+                    y[c],
+                    scatter_kwargs)
 
         #if hl_snps is not None:
         #    plt.scatter((pos + starting_pos)[hl_snps[c]], y[c][hl_snps[c]],

diff --git a/magenpy/stats/gwa/utils.py b/magenpy/stats/gwa/utils.py
@@ -66,6 +66,12 @@ def perform_gwa_plink2(genotype_matrix,
         warnings.warn("The phenotype likelihood is not specified! "
                       "Assuming that the phenotype is continuous...")
 
+    # Some interfaces call this function with a `standardize_genotype` entry
+    # in the phenotype transformation kwargs. We remove that entry (whatever
+    # its value) before transforming the phenotype to avoid errors:
+    if 'standardize_genotype' in phenotype_transform_kwargs:
+        del phenotype_transform_kwargs['standardize_genotype']
+
     # Transform the phenotype:
     phenotype, mask = chained_transform(s_table, **phenotype_transform_kwargs)
 
@@ -282,39 +288,55 @@ def perform_gwa_xarray(genotype_matrix,
 
     # Get the SNP table from the genotype_matrix object:
     sumstats_table = genotype_matrix.get_snp_table(
-        ['CHR', 'SNP', 'POS', 'A1', 'A2', 'N', 'MAF']
+        ['CHR', 'SNP', 'POS', 'A1', 'A2']
     )
 
     # -----------------------------------------------------------
 
     # Transform the phenotype:
     phenotype, mask = chained_transform(genotype_matrix.sample_table, **phenotype_transform_kwargs)
 
-    # TODO: Figure out how to adjust the per-variant sample size based on the mask!
-
-    # Estimate the phenotypic variance:
-    sigma_sq_y = np.var(phenotype)
-
     # -----------------------------------------------------------
-    # Perform association testing using closed-form solutions:
+    # Prepare the genotype data for association testing:
 
     # Apply the mask to the genotype matrix:
     xr_mat = genotype_matrix.xr_mat[mask, :]
 
-    if standardize_genotype:
+    # Compute sample size per SNP:
+    n_per_snp = xr_mat.shape[0] - xr_mat.isnull().sum(axis=0).compute().values
 
-        from ..transforms.genotype import standardize
+    # Compute minor allele frequency per SNP:
+    maf = xr_mat.sum(axis=0).compute().values / (2 * n_per_snp)
 
-        sumstats_table['BETA'] = np.dot(standardize(xr_mat).T, phenotype) / sumstats_table['N'].values
-        sumstats_table['SE'] = np.sqrt(sigma_sq_y / sumstats_table['N'].values)
+    # Standardize or center the genotype matrix (account for missing values):
+    if standardize_genotype:
+        from ..transforms.genotype import standardize
+        xr_mat = standardize(xr_mat, fill_na=True)
     else:
+        xr_mat = (xr_mat - 2.*maf).fillna(0.)
+
+    # Compute the sum of squares per SNP:
+    sum_x_sq = (xr_mat**2).sum(axis=0).compute().values
 
-        sumstats_table['BETA'] = (
-            np.dot(xr_mat.fillna(sumstats_table['MAF'].values).T, phenotype) /
-            sumstats_table['N'].values * genotype_matrix.maf_var
-        )
+    # -----------------------------------------------------------
+    # Compute quantities for association testing:
+
+    slope = np.dot(xr_mat.T, phenotype - phenotype.mean()) / sum_x_sq
+    intercept = phenotype.mean()
+
+    y_hat = xr_mat*slope + intercept
+
+    s2 = ((phenotype.reshape(-1, 1) - y_hat)**2).sum(axis=0) / (n_per_snp - 2)
+
+    se = np.sqrt(s2 / sum_x_sq)
+
+    # -----------------------------------------------------------
+    # Populate the data in the summary statistics table:
 
-        sumstats_table['SE'] = np.sqrt(sigma_sq_y / (sumstats_table['N'].values * genotype_matrix.maf_var))
+    sumstats_table['MAF'] = maf
+    sumstats_table['N'] = n_per_snp
+    sumstats_table['BETA'] = slope
+    sumstats_table['SE'] = se
 
     ss_table = SumstatsTable(sumstats_table)
     # Trigger computing z-score and p-values from the BETA and SE columns:

diff --git a/setup.py b/setup.py
@@ -79,7 +79,7 @@ def no_cythonize(extensions, **_ignore):
 
 setup(
     name="magenpy",
-    version="0.1.0",
+    version="0.1.1",
     author="Shadi Zabad",
     author_email="shadi.zabad@mail.mcgill.ca",
     description="Modeling and Analysis of Statistical Genetics data in python",

diff --git a/tests/conda_manual_testing.sh b/tests/conda_manual_testing.sh
@@ -37,6 +37,7 @@ do
     # Run pytest
     python -m pytest -v
 
+    # Check the installed scripts
     magenpy_ld -h
     magenpy_simulate -h