Merge pull request #76 from apriha/develop
v1.2.0
apriha committed Jun 5, 2020
2 parents 1ed0a1f + 5a1a6bf commit 3ef1699
Showing 34 changed files with 4,454 additions and 2,983 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -115,7 +115,6 @@ tests/resources/*
 !tests/resources/gsa_rsid_map.txt
 tests/input/23andme.txt.zip
 tests/input/discrepant_snps[12].csv
-tests/input/empty.txt
 tests/input/ftdna.csv.gz
 tests/input/generic.fa.gz
 tests/input/testvcf.vcf.gz
27 changes: 9 additions & 18 deletions .travis.yml
@@ -9,35 +9,26 @@ language: python
 
 before_install:
   - pip install --upgrade pip setuptools wheel
-  - pip install pytest-cov codecov awscli
+  - pip install pytest-cov codecov
 
 install:
   - pip install .
 
 script:
-  # for testing, limit downloads from the resource servers by using cached resources;
-  # note that the master branch is tested weekly via `cron`, so this ensures all Python
-  # versions will be periodically integration tested with the resource servers
+  # for testing, limit downloads from the resource servers to only the selected job for
+  # PRs and the master branch; note that the master branch is tested weekly via `cron`,
+  # so this ensures all Python versions will be periodically integration tested with the
+  # resource servers
   - set -e
   - NUM_JOBS=4
   - SELECTED_JOB=$((10#$(date +%V) % $NUM_JOBS)) # identify a job based on week of the year
-  - DOWNLOADS_ENABLED=false
   - |
-    if [[ $TRAVIS_PULL_REQUEST != "false" ]]; then
-      # download resources for all jobs on a pull request
-      DOWNLOADS_ENABLED=true
+    if [[ $TRAVIS_PULL_REQUEST != "false" && $SELECTED_JOB == $JOB_ID ]]; then
+      # download resources for selected job on a pull request
+      export DOWNLOADS_ENABLED=true
     elif [[ $TRAVIS_BRANCH == "master" && $SELECTED_JOB == $JOB_ID ]]; then
       # download resources for selected job on master branch
-      DOWNLOADS_ENABLED=true
-    fi
-  - |
-    if [[ $DOWNLOADS_ENABLED == "false" ]]; then
-      # use cached resources on Amazon S3
-      aws s3 cp s3://snps-resources/resources.tar.gz resources.tar.gz
-      if [[ -f resources.tar.gz ]]; then
-        tar -xzf resources.tar.gz
-        rm resources.tar.gz
-      fi
+      export DOWNLOADS_ENABLED=true
     fi
   - pytest --cov=snps tests
   - |
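A note on the job-selection arithmetic above: `date +%V` prints the ISO week of the year zero-padded (e.g., `08`), and the `10#` prefix forces base-10 interpretation so bash does not reject `08` or `09` as invalid octal literals. A minimal Python sketch of the same weekly rotation (here `JOB_ID` stands in for the per-job index that each job in the Travis build matrix is assumed to define):

import datetime

NUM_JOBS = 4  # size of the build matrix (one job per Python version)
JOB_ID = 0  # assumed per-job index, 0..NUM_JOBS-1

# ISO week of the year (1-53), equivalent to `date +%V`
week = datetime.date.today().isocalendar()[1]

# rotate which job downloads from the resource servers, one job per week
downloads_enabled = week % NUM_JOBS == JOB_ID
print(downloads_enabled)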
15 changes: 15 additions & 0 deletions analysis/parse-opensnp-files/README.rst
@@ -0,0 +1,15 @@
parse-opensnp-files
===================
scripts to load and debug parsing of openSNP datadump files

Method
------
Attempt to parse each file in the `openSNP <https://opensnp.org>`_ datadump by creating a
``SNPs`` object. For files where SNPs were loaded, save summary statistics to a dataframe and
output as a CSV. For files where no SNPs were loaded, save a message for each file indicating
the issue and optionally extract these files from the datadump for debugging.

Results
-------
As of May 2020, ``snps`` can parse ~96.6% of the genotype files in the datadump. Additionally,
``snps`` can detect the build in ~99.9% of those files.
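For reference, the core of the method above is just constructing a ``SNPs`` object from a file's raw bytes; a minimal sketch using the same calls as the scripts below (the filename here is hypothetical, and a ``resources`` directory is assumed):

from snps import SNPs
from snps.resources import Resources

r = Resources(resources_dir="resources")

# hypothetical filename from the datadump
file = "user1_file1_yearofbirth_unknown_sex_unknown.23andme.txt"

# attempt to parse the file by creating a SNPs object
s = SNPs(r.load_opensnp_datadump_file(file), assign_par_snps=False)

if s.snp_count != 0:
    # dict with source, build, build_detected, chromosomes, snp_count, etc.
    print(s.get_summary())
else:
    print("no SNPs processed")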
21 changes: 21 additions & 0 deletions analysis/parse-opensnp-files/get_file.py
@@ -0,0 +1,21 @@
""" Get a file from the openSNP datadump for debugging. """

import os

from atomicwrites import atomic_write

from snps.resources import Resources
from snps.utils import create_dir

OUTPUT_DIR = "output"
FILE = "user662_file340_yearofbirth_unknown_sex_unknown.23andme.txt"

if __name__ == "__main__":
    # create output directory for this example
    create_dir(OUTPUT_DIR)

    # assume script is being run from the analysis/parse-opensnp-files dir
    r = Resources(resources_dir="../../resources")

    with atomic_write(os.path.join(OUTPUT_DIR, FILE), mode="wb") as f:
        f.write(r.load_opensnp_datadump_file(FILE))
141 changes: 141 additions & 0 deletions analysis/parse-opensnp-files/parse_opensnp_files.py
@@ -0,0 +1,141 @@
""" Parse openSNP datadump files.
Attempt to parse each file in the openSNP datadump. For files where SNPs were loaded,
save summary statistics to a dataframe and output as a CSV. For files where no SNPs were
loaded, save a message for each file indicating the issue and optionally extract these
files from the datadump for debugging.
"""

import logging
import os
import random

from atomicwrites import atomic_write
import pandas as pd

from snps import SNPs
from snps.resources import Resources
from snps.utils import Parallelizer, save_df_as_csv, create_dir, clean_str

OUTPUT_DIR = "output"
EXTRACT_FILES = True

# create output directory for this example
create_dir(OUTPUT_DIR)

# assume script is being run from the analysis/parse-opensnp-files dir
r = Resources(resources_dir="../../resources")

# set up logger to output to a file in the output directory
logging.basicConfig(
    filename=os.path.join(OUTPUT_DIR, "parse-opensnp-files.txt"),
    format="%(asctime)s: %(message)s",
    filemode="w",
    level=logging.INFO,
)

logger = logging.getLogger()


def load_file(task):
    file = task["file"]

    try:
        s = SNPs(r.load_opensnp_datadump_file(file), assign_par_snps=False)
    except Exception as err:
        return {"msg": str(err).strip()[:100], "file": file}

    if s.snp_count != 0:
        d = s.get_summary()
        d.update({"file": file})
        return d
    else:
        return {"msg": "no SNPs processed", "file": file}


def main():
    logger.info("start")

    # get filenames from openSNP data dump
    filenames = r.get_opensnp_datadump_filenames()

    filenames = [
        filename
        for filename in filenames
        if "readme" not in filename and "phenotype" not in filename
    ]

    # draw a sample from the observations
    random.seed(1)
    SAMPLE_SIZE = len(filenames)
    # SAMPLE_SIZE = 10
    samples = random.sample(range(len(filenames)), SAMPLE_SIZE)

    # set up tasks for parallelizing / execution on multiple cores
    p = Parallelizer(parallelize=True)
    tasks = [{"file": filenames[i]} for i in samples]

    # run tasks; results is a list of dicts
    results = p(load_file, tasks)

    # get results from `load_file` where `snp_count` was non-zero
    rows = [item for item in results if "msg" not in item]

    df = pd.DataFrame(
        rows,
        columns=[
            "file",
            "source",
            "build",
            "build_detected",
            "chromosomes",
            "snp_count",
        ],
    )

    save_df_as_csv(df, OUTPUT_DIR, "parse-opensnp-files.csv")

    # log parsing statistics
    file_count = len(filenames)
    logger.info("{} files in the openSNP datadump".format(file_count))
    logger.info("{:.2%} of openSNP datadump files parsed".format(len(df) / file_count))
    logger.info(
        "build detected in {:.2%} of files parsed".format(
            len(df.loc[df.build_detected]) / len(df)
        )
    )

    # extract files from the datadump where `load_file` returned a message
    if EXTRACT_FILES:
        # group files with same message (e.g., {"some message": ["file1", "file2"], ...})
        d = {}
        for result in results:
            if "msg" in result:
                if result["msg"] in d:
                    d[result["msg"]].append(result["file"])
                else:
                    d[result["msg"]] = [result["file"]]

        # add messages / file filters as necessary...
        d["build not detected"] = list(df.loc[~df.build_detected].file.values)

        # extract files that have messages for debugging
        for msg, files in d.items():
            if len(files) == 0:
                continue

            # create a directory for each message (prefix indicates number of files)
            path = os.path.join(
                OUTPUT_DIR, "{:04}_{}".format(len(files), clean_str(msg))
            )
            create_dir(path)
            # save each file with message into created directory
            for filename in files:
                with atomic_write(os.path.join(path, filename), mode="wb") as f:
                    f.write(r.load_opensnp_datadump_file(filename))

    logger.info("stop")


if __name__ == "__main__":
    main()