Merge pull request #143 from arvkevi/feature/add_ancestry_predictions

Basic ancestry functionality
apriha · Oct 22, 2021 · 2e8ccfe · 2e8ccfe
2 parents 8ca5d75 + d29743f
commit 2e8ccfe
Show file tree

Hide file tree

Showing 5 changed files with 218 additions and 0 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -111,3 +111,29 @@ jobs:
       uses: codecov/codecov-action@v1
       with:
         fail_ci_if_error: true
+
+  # Re-run the test suite with the optional ezancestry extra installed.
+  test-extras:
+    # only starts once the `test` job has completed successfully
+    needs: [test]
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        python-version: [3.7, 3.8, 3.9]
+
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        # fetch-depth 0 checks out the full history rather than a shallow clone
+        fetch-depth: 0
+        persist-credentials: false
+    - name: Setup Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        pip install --upgrade pip setuptools wheel
+        pip install pytest-cov
+        # install this package together with its `ezancestry` extra
+        pip install .[ezancestry]
+    - name: Test with pytest
+      run: |
+        pytest --cov=snps tests
diff --git a/README.rst b/README.rst
@@ -36,6 +36,11 @@ Data Cleaning
 - Deduplicate alleles on MT
 - Assign PAR SNPs to the X or Y chromosome
 
+Analysis
+````````
+- Derive sex from SNPs
+- Predict ancestry from SNPs (when installed with `ezancestry <https://github.com/arvkevi/ezancestry>`_)
+
 Supported Genotype Files
 ------------------------
 ``snps`` supports `VCF <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3137218/>`_ files and
@@ -72,6 +77,11 @@ Python dependencies) via ``pip``::
 
     $ pip install snps
 
+For `ancestry prediction <https://snps.readthedocs.io/en/stable/snps.html#snps.snps.SNPs.predict_ancestry>`_
+capability, ``snps`` can be installed with `ezancestry <https://github.com/arvkevi/ezancestry>`_::
+
+    $ pip install snps[ezancestry]
+
 Examples
 --------
 Download Example Data

diff --git a/setup.py b/setup.py
@@ -119,6 +119,7 @@
     },
     keywords="snps dna chromosomes bioinformatics vcf",
     install_requires=["numpy", "pandas!=1.0.0,!=1.1.0", "atomicwrites"],
+    extras_require={"ezancestry": ["ezancestry"]},
     python_requires=">=3.7.1",
     platforms=["any"],
 )
diff --git a/src/snps/snps.py b/src/snps/snps.py
@@ -163,6 +163,7 @@ def __init__(
 
                 if deduplicate_MT_chrom:
                     self._deduplicate_MT_chrom()
+
             else:
                 logger.warning("no SNPs loaded...")
 
@@ -1611,3 +1612,110 @@ def is_valid(self):
             DeprecationWarning,
         )
         return self.valid
+
+    def predict_ancestry(
+        self,
+        output_directory=None,
+        write_predictions=False,
+        models_directory=None,
+        aisnps_directory=None,
+        n_components=None,
+        k=None,
+        thousand_genomes_directory=None,
+        samples_directory=None,
+        algorithm=None,
+        aisnps_set=None,
+    ):
+        """ Predict genetic ancestry for SNPs.
+
+        Predictions by `ezancestry <https://github.com/arvkevi/ezancestry>`_.
+
+        Notes
+        -----
+        Populations below are described `here <https://www.internationalgenome.org/faq/what-do-the-population-codes-mean/>`_.
+
+        Parameters
+        ----------
+        various : optional
+            See the available settings for `predict` at `ezancestry <https://github.com/arvkevi/ezancestry>`_.
+
+        Returns
+        -------
+        dict
+            dict with the following keys:
+
+            `population_code` (str)
+              max predicted population for the sample
+            `population_description` (str)
+              descriptive name of the population
+            `population_percent` (float)
+              predicted probability for the max predicted population
+            `superpopulation_code` (str)
+              max predicted super population (continental) for the sample
+            `superpopulation_description` (str)
+              descriptive name of the super population
+            `superpopulation_percent` (float)
+              predicted probability for the max predicted super population
+            `ezancestry_df` (pandas.DataFrame)
+              pandas.DataFrame with the following columns:
+
+              `component1`, `component2`, `component3`
+                The coordinates of the sample in the dimensionality-reduced component space. Can be
+                used as (x, y, z,) coordinates for plotting in a 3d scatter plot.
+              `predicted_population_population`
+                The max predicted population for the sample.
+              `ACB`, `ASW`, `BEB`, `CDX`, `CEU`, `CHB`, `CHS`, `CLM`, `ESN`, `FIN`, `GBR`, `GIH`, `GWD`, `IBS`, `ITU`, `JPT`, `KHV`, `LWK`, `MSL`, `MXL`, `PEL`, `PJL`, `PUR`, `STU`, `TSI`, `YRI`
+                Predicted probabilities for each of the populations. These sum to 1.0.
+              `predicted_population_superpopulation`
+                The max predicted super population (continental) for the sample.
+              `AFR`, `AMR`, `EAS`, `EUR`, `SAS`
+                Predicted probabilities for each of the super populations. These sum to 1.0.
+              `population_description`, `superpopulation_name`
+                Descriptive names of the population and super population.
+
+        """
+        if not self.valid:
+            return {}
+
+        try:
+            from ezancestry.commands import predict
+        except ModuleNotFoundError:
+            raise ModuleNotFoundError(
+                "Ancestry prediction requires the ezancestry package; please install it using pip install ezancestry"
+            )
+
+        def max_pop(row):
+            popcode = row["predicted_population_population"]
+            popdesc = row["population_description"]
+            poppct = row[popcode]
+            superpopcode = row["predicted_population_superpopulation"]
+            superpopdesc = row["superpopulation_name"]
+            superpoppct = row[superpopcode]
+
+            return {
+                "population_code": popcode,
+                "population_description": popdesc,
+                "population_percent": poppct,
+                "superpopulation_code": superpopcode,
+                "superpopulation_description": superpopdesc,
+                "superpopulation_percent": superpoppct,
+            }
+
+        predictions = predict(
+            self.snps,
+            output_directory,
+            write_predictions,
+            models_directory,
+            aisnps_directory,
+            n_components,
+            k,
+            thousand_genomes_directory,
+            samples_directory,
+            algorithm,
+            aisnps_set,
+        )
+
+        d = dict(predictions.apply(max_pop, axis=1).iloc[0])
+        d["ezancestry_df"] = predictions
+
+        return d
diff --git a/tests/test_snps.py b/tests/test_snps.py
@@ -31,8 +31,10 @@
 
 """
 
+import importlib.util
 import io
 import os
+import sys
 import tempfile
 from unittest.mock import Mock, patch
 import warnings
@@ -458,6 +460,77 @@ def test_count_no_snps(self):
             self.assertEqual(snps.count, 0)
             self.assertTrue(snps.snps.empty)
 
+    def _make_ancestry_assertions(self, d):
+        self.assertEqual(d["population_code"], "ITU")
+        self.assertEqual(d["population_description"], "Indian Telugu in the UK")
+        self.assertAlmostEqual(d["population_percent"], 0.2992757864426246)
+        self.assertEqual(d["superpopulation_code"], "SAS")
+        self.assertEqual(d["superpopulation_description"], "South Asian Ancestry")
+        self.assertAlmostEqual(d["superpopulation_percent"], 0.827977563875996)
+        self.assertTrue("predicted_population_population" in d["ezancestry_df"].keys())
+        self.assertTrue(
+            "predicted_population_superpopulation" in d["ezancestry_df"].keys()
+        )
+
+    def test_ancestry(self):
+        def pop_modules(modules):
+            d = {}
+            for m in modules:
+                if m in sys.modules:
+                    d[m] = sys.modules.pop(m)
+            return d
+
+        if importlib.util.find_spec("ezancestry") is not None:
+            # test with ezancestry if installed
+            s = SNPs("tests/input/generic.csv")
+            self._make_ancestry_assertions(s.predict_ancestry())
+
+        ezancestry_mods = ["ezancestry", "ezancestry.commands"]
+        popped_mods = pop_modules(ezancestry_mods)
+
+        # mock ezancestry modules
+        for mod in ezancestry_mods:
+            sys.modules[mod] = Mock()
+
+        sys.modules["ezancestry.commands"].predict = Mock(
+            return_value=pd.DataFrame(
+                {
+                    "predicted_population_population": ["ITU"],
+                    "population_description": ["Indian Telugu in the UK"],
+                    "ITU": [0.2992757864426246],
+                    "predicted_population_superpopulation": ["SAS"],
+                    "superpopulation_name": ["South Asian Ancestry"],
+                    "SAS": [0.827977563875996],
+                }
+            )
+        )
+
+        # test with mocked ezancestry
+        s = SNPs("tests/input/generic.csv")
+        self._make_ancestry_assertions(s.predict_ancestry())
+
+        # unload mocked ezancestry modules
+        pop_modules(ezancestry_mods)
+
+        # restore ezancestry modules if ezancestry installed
+        sys.modules.update(popped_mods)
+
+    def test_ancestry_module_not_found_error(self):
+        if importlib.util.find_spec("ezancestry") is None:
+            # test when ezancestry not installed
+            s = SNPs("tests/input/generic.csv")
+            with self.assertRaises(ModuleNotFoundError) as err:
+                _ = s.predict_ancestry()
+
+            self.assertEqual(
+                err.exception.msg,
+                "Ancestry prediction requires the ezancestry package; please install it using pip install ezancestry",
+            )
+
+    def test_ancestry_no_snps(self):
+        for snps in self.empty_snps():
+            self.assertDictEqual(snps.predict_ancestry(), {})
+
 
 class TestSNPsMerge(TestSnps):
     def assert_results(self, results, expected_results):