Skip to content

Commit

Permalink
Merge pull request #143 from arvkevi/feature/add_ancestry_predictions
Browse files Browse the repository at this point in the history
Basic ancestry functionality
  • Loading branch information
apriha committed Oct 22, 2021
2 parents 8ca5d75 + d29743f commit 2e8ccfe
Show file tree
Hide file tree
Showing 5 changed files with 218 additions and 0 deletions.
26 changes: 26 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -111,3 +111,29 @@ jobs:
uses: codecov/codecov-action@v1
with:
fail_ci_if_error: true

# Re-runs the test suite with the optional ``ezancestry`` extra installed,
# after the main ``test`` job has finished.
test-extras:
  needs: [test]
  runs-on: ${{ matrix.os }}
  strategy:
    matrix:
      os: [ubuntu-latest]
      # Quoted so YAML keeps the versions as strings; an unquoted 3.10 would
      # be loaded as the float 3.1.
      python-version: ["3.7", "3.8", "3.9"]

  steps:
    - uses: actions/checkout@v2
      with:
        fetch-depth: 0
        persist-credentials: false
    - name: Setup Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        pip install --upgrade pip setuptools wheel
        pip install pytest-cov
        # Quoted so the shell never treats the brackets as a glob pattern.
        pip install ".[ezancestry]"
    - name: Test with pytest
      run: |
        pytest --cov=snps tests
10 changes: 10 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ Data Cleaning
- Deduplicate alleles on MT
- Assign PAR SNPs to the X or Y chromosome

Analysis
````````
- Derive sex from SNPs
- Predict ancestry from SNPs (when installed with `ezancestry <https://github.com/arvkevi/ezancestry>`_)

Supported Genotype Files
------------------------
``snps`` supports `VCF <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3137218/>`_ files and
Expand Down Expand Up @@ -72,6 +77,11 @@ Python dependencies) via ``pip``::

$ pip install snps

For `ancestry prediction <https://snps.readthedocs.io/en/stable/snps.html#snps.snps.SNPs.predict_ancestry>`_
capability, ``snps`` can be installed with `ezancestry <https://github.com/arvkevi/ezancestry>`_::

$ pip install snps[ezancestry]

Examples
--------
Download Example Data
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@
},
keywords="snps dna chromosomes bioinformatics vcf",
install_requires=["numpy", "pandas!=1.0.0,!=1.1.0", "atomicwrites"],
extras_require={"ezancestry": ["ezancestry"]},
python_requires=">=3.7.1",
platforms=["any"],
)
108 changes: 108 additions & 0 deletions src/snps/snps.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ def __init__(

if deduplicate_MT_chrom:
self._deduplicate_MT_chrom()

else:
logger.warning("no SNPs loaded...")

Expand Down Expand Up @@ -1611,3 +1612,110 @@ def is_valid(self):
DeprecationWarning,
)
return self.valid

def predict_ancestry(
    self,
    output_directory=None,
    write_predictions=False,
    models_directory=None,
    aisnps_directory=None,
    n_components=None,
    k=None,
    thousand_genomes_directory=None,
    samples_directory=None,
    algorithm=None,
    aisnps_set=None,
):
    """Predict genetic ancestry for SNPs.

    Predictions by `ezancestry <https://github.com/arvkevi/ezancestry>`_.

    Notes
    -----
    Populations below are described `here <https://www.internationalgenome.org/faq/what-do-the-population-codes-mean/>`_.

    Parameters
    ----------
    various : optional
        See the available settings for `predict` at `ezancestry <https://github.com/arvkevi/ezancestry>`_.
        Every argument is forwarded unchanged, positionally and in signature
        order, after ``self.snps``.

    Returns
    -------
    dict
        empty dict when ``self.valid`` is falsy; otherwise dict with the following keys:

        `population_code` (str)
          max predicted population for the sample
        `population_description` (str)
          descriptive name of the population
        `population_percent` (float)
          predicted probability for the max predicted population
        `superpopulation_code` (str)
          max predicted super population (continental) for the sample
        `superpopulation_description` (str)
          descriptive name of the super population
        `superpopulation_percent` (float)
          predicted probability for the max predicted super population
        `ezancestry_df` (pandas.DataFrame)
          pandas.DataFrame with the following columns:

          `component1`, `component2`, `component3`
            The coordinates of the sample in the dimensionality-reduced component space. Can be
            used as (x, y, z,) coordinates for plotting in a 3d scatter plot.
          `predicted_population_population`
            The max predicted population for the sample.
          `ACB`, `ASW`, `BEB`, `CDX`, `CEU`, `CHB`, `CHS`, `CLM`, `ESN`, `FIN`, `GBR`, `GIH`, `GWD`, `IBS`, `ITU`, `JPT`, `KHV`, `LWK`, `MSL`, `MXL`, `PEL`, `PJL`, `PUR`, `STU`, `TSI`, `YRI`
            Predicted probabilities for each of the populations. These sum to 1.0.
          `predicted_population_superpopulation`
            The max predicted super population (continental) for the sample.
          `AFR`, `AMR`, `EAS`, `EUR`, `SAS`
            Predicted probabilities for each of the super populations. These sum to 1.0.
          `population_description`, `superpopulation_name`
            Descriptive names of the population and super population.

    Raises
    ------
    ModuleNotFoundError
        If ``ezancestry.commands`` cannot be imported; the original import
        error is attached as the cause.
    """
    if not self.valid:
        return {}

    try:
        # Imported here rather than at module level so that ezancestry stays
        # an optional dependency.
        from ezancestry.commands import predict
    except ModuleNotFoundError as err:
        # Chain the original error: the module that failed to import may be
        # a dependency of ezancestry rather than ezancestry itself.
        raise ModuleNotFoundError(
            "Ancestry prediction requires the ezancestry package; please install it using pip install ezancestry"
        ) from err

    def max_pop(row):
        # The predicted codes double as the names of the columns that hold
        # the corresponding probabilities.
        popcode = row["predicted_population_population"]
        popdesc = row["population_description"]
        poppct = row[popcode]
        superpopcode = row["predicted_population_superpopulation"]
        superpopdesc = row["superpopulation_name"]
        superpoppct = row[superpopcode]

        return {
            "population_code": popcode,
            "population_description": popdesc,
            "population_percent": poppct,
            "superpopulation_code": superpopcode,
            "superpopulation_description": superpopdesc,
            "superpopulation_percent": superpoppct,
        }

    predictions = predict(
        self.snps,
        output_directory,
        write_predictions,
        models_directory,
        aisnps_directory,
        n_components,
        k,
        thousand_genomes_directory,
        samples_directory,
        algorithm,
        aisnps_set,
    )

    # Only the first row of the predictions is summarized; the full
    # DataFrame is returned alongside the summary.
    d = dict(predictions.apply(max_pop, axis=1).iloc[0])
    d["ezancestry_df"] = predictions

    return d
73 changes: 73 additions & 0 deletions tests/test_snps.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,10 @@
"""

import importlib.util
import io
import os
import sys
import tempfile
from unittest.mock import Mock, patch
import warnings
Expand Down Expand Up @@ -458,6 +460,77 @@ def test_count_no_snps(self):
self.assertEqual(snps.count, 0)
self.assertTrue(snps.snps.empty)

def _make_ancestry_assertions(self, d):
    """Check an ancestry prediction result against the expected values.

    Parameters
    ----------
    d : dict
        result of ``predict_ancestry``; ``d["ezancestry_df"]`` must support
        membership tests on its column names
    """
    self.assertEqual(d["population_code"], "ITU")
    self.assertEqual(d["population_description"], "Indian Telugu in the UK")
    self.assertAlmostEqual(d["population_percent"], 0.2992757864426246)
    self.assertEqual(d["superpopulation_code"], "SAS")
    self.assertEqual(d["superpopulation_description"], "South Asian Ancestry")
    self.assertAlmostEqual(d["superpopulation_percent"], 0.827977563875996)
    # assertIn reports the missing name and the container on failure, unlike
    # assertTrue, which only reports "False is not true".
    self.assertIn("predicted_population_population", d["ezancestry_df"])
    self.assertIn("predicted_population_superpopulation", d["ezancestry_df"])

def test_ancestry(self):
    """Check ancestry prediction with real (if installed) and mocked ezancestry."""

    def pop_modules(modules):
        # Remove the named modules from sys.modules, returning the ones
        # that were actually loaded so they can be restored later.
        d = {}
        for m in modules:
            if m in sys.modules:
                d[m] = sys.modules.pop(m)
        return d

    if importlib.util.find_spec("ezancestry") is not None:
        # test with ezancestry if installed
        s = SNPs("tests/input/generic.csv")
        self._make_ancestry_assertions(s.predict_ancestry())

    ezancestry_mods = ["ezancestry", "ezancestry.commands"]
    popped_mods = pop_modules(ezancestry_mods)

    try:
        # mock ezancestry modules
        for mod in ezancestry_mods:
            sys.modules[mod] = Mock()

        sys.modules["ezancestry.commands"].predict = Mock(
            return_value=pd.DataFrame(
                {
                    "predicted_population_population": ["ITU"],
                    "population_description": ["Indian Telugu in the UK"],
                    "ITU": [0.2992757864426246],
                    "predicted_population_superpopulation": ["SAS"],
                    "superpopulation_name": ["South Asian Ancestry"],
                    "SAS": [0.827977563875996],
                }
            )
        )

        # test with mocked ezancestry
        s = SNPs("tests/input/generic.csv")
        self._make_ancestry_assertions(s.predict_ancestry())
    finally:
        # Clean up even when an assertion above fails, so the mocks never
        # leak into other tests.

        # unload mocked ezancestry modules
        pop_modules(ezancestry_mods)

        # restore ezancestry modules if ezancestry installed
        sys.modules.update(popped_mods)

def test_ancestry_module_not_found_error(self):
    """Check the error raised when ezancestry cannot be imported.

    The import is blocked explicitly, so the error path is exercised
    whether or not ezancestry is installed.
    """
    s = SNPs("tests/input/generic.csv")

    # A ``None`` entry in sys.modules makes importing that name raise
    # ModuleNotFoundError. Both names are blocked because an already-loaded
    # ``ezancestry.commands`` would otherwise be found without consulting
    # its parent package. patch.dict restores sys.modules on exit.
    blocked = {"ezancestry": None, "ezancestry.commands": None}
    with patch.dict(sys.modules, blocked):
        with self.assertRaises(ModuleNotFoundError) as err:
            s.predict_ancestry()

    self.assertEqual(
        err.exception.msg,
        "Ancestry prediction requires the ezancestry package; please install it using pip install ezancestry",
    )

def test_ancestry_no_snps(self):
    """Ancestry prediction on each empty SNPs object returns an empty dict."""
    for empty in self.empty_snps():
        prediction = empty.predict_ancestry()
        self.assertDictEqual(prediction, {})


class TestSNPsMerge(TestSnps):
def assert_results(self, results, expected_results):
Expand Down

0 comments on commit 2e8ccfe

Please sign in to comment.