Skip to content

Commit

Permalink
Merge pull request #72 from apriha/enhance-file-parsing
Browse files Browse the repository at this point in the history
Enhance file parsing and build detection
  • Loading branch information
apriha committed Jun 5, 2020
2 parents 621ad0e + 5ea82b7 commit 5a1a6bf
Show file tree
Hide file tree
Showing 19 changed files with 836 additions and 97 deletions.
15 changes: 15 additions & 0 deletions analysis/parse-opensnp-files/README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
parse-opensnp-files
===================
Scripts to load and debug the parsing of openSNP datadump files.

Method
------
Attempt to parse each file in the `openSNP <https://opensnp.org>`_ datadump by creating a
``SNPs`` object. For files where SNPs were loaded, save summary statistics to a dataframe and
output as a CSV. For files where no SNPs were loaded, save a message for each file indicating
the issue and optionally extract these files from the datadump for debugging.

Results
-------
As of May 2020, ``snps`` can parse ~96.6% of the genotype files in the datadump. Additionally,
``snps`` can detect the build in ~99.9% of those files.
21 changes: 21 additions & 0 deletions analysis/parse-opensnp-files/get_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
""" Get a file from the openSNP datadump for debugging. """

import os

from atomicwrites import atomic_write

from snps.resources import Resources
from snps.utils import create_dir

OUTPUT_DIR = "output"
FILE = "user662_file340_yearofbirth_unknown_sex_unknown.23andme.txt"

if __name__ == "__main__":
    # ensure the example's output directory exists
    create_dir(OUTPUT_DIR)

    # resources live two levels up when this is run from the examples dir
    resources = Resources(resources_dir="../../resources")

    # write atomically so an interrupted run leaves no partial file behind
    destination = os.path.join(OUTPUT_DIR, FILE)
    with atomic_write(destination, mode="wb") as f:
        f.write(resources.load_opensnp_datadump_file(FILE))
141 changes: 141 additions & 0 deletions analysis/parse-opensnp-files/parse_opensnp_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
""" Parse openSNP datadump files.
Attempt to parse each file in the openSNP datadump. For files where SNPs were loaded,
save summary statistics to a dataframe and output as a CSV. For files where no SNPs were
loaded, save a message for each file indicating the issue and optionally extract these
files from the datadump for debugging.
"""

import logging
import os
import random

from atomicwrites import atomic_write
import pandas as pd

from snps import SNPs
from snps.resources import Resources
from snps.utils import Parallelizer, save_df_as_csv, create_dir, clean_str

OUTPUT_DIR = "output"
# when True, copy unparseable files out of the datadump for debugging
EXTRACT_FILES = True

# create output directory for this example
create_dir(OUTPUT_DIR)

# assume script is being run from examples dir
r = Resources(resources_dir="../../resources")

# setup logger to output to file in output directory
logging.basicConfig(
    # os.path.join already returns a str; the "{}".format wrapper was redundant
    filename=os.path.join(OUTPUT_DIR, "parse-opensnp-files.txt"),
    format="%(asctime)s: %(message)s",
    filemode="w",
    level=logging.INFO,
)

logger = logging.getLogger()


def load_file(task):
    """Attempt to parse one openSNP datadump file as a ``SNPs`` object.

    Returns the object's summary dict (plus the filename) when SNPs were
    loaded; otherwise returns a dict with a short diagnostic message.
    """
    filename = task["file"]

    try:
        snps_obj = SNPs(
            r.load_opensnp_datadump_file(filename), assign_par_snps=False
        )
    except Exception as err:
        # truncate the exception text so identical failures group cleanly
        return {"msg": str(err).strip()[:100], "file": filename}

    if snps_obj.snp_count == 0:
        return {"msg": "no SNPs processed", "file": filename}

    summary = snps_obj.get_summary()
    summary.update({"file": filename})
    return summary


def main():
    """Parse the openSNP datadump files in parallel, save summary
    statistics as a CSV, log parsing rates, and optionally extract
    unparseable files for debugging.
    """
    logger.info("start")

    # get filenames from openSNP data dump
    filenames = r.get_opensnp_datadump_filenames()

    # keep only genotype files
    filenames = [
        filename
        for filename in filenames
        if "readme" not in filename and "phenotype" not in filename
    ]

    # draw a (currently exhaustive) reproducible sample of the observations
    random.seed(1)
    SAMPLE_SIZE = len(filenames)
    # SAMPLE_SIZE = 10
    samples = random.sample(range(len(filenames)), SAMPLE_SIZE)

    # setup tasks for parallelizing / execution on multiple cores
    p = Parallelizer(parallelize=True)
    tasks = [{"file": filenames[i]} for i in samples]

    # run tasks; results is a list of dicts
    results = p(load_file, tasks)

    # get results from `load_file` where `snp_count` was non-zero
    rows = [item for item in results if "msg" not in item]

    df = pd.DataFrame(
        rows,
        columns=[
            "file",
            "source",
            "build",
            "build_detected",
            "chromosomes",
            "snp_count",
        ],
    )

    save_df_as_csv(df, OUTPUT_DIR, "parse-opensnp-files.csv")

    # log parsing statistics; guard the ratios so an empty run doesn't
    # raise ZeroDivisionError
    file_count = len(filenames)
    logger.info("{} files in the openSNP datadump".format(file_count))
    if file_count:
        logger.info(
            "{:.2%} of openSNP datadump files parsed".format(len(df) / file_count)
        )
    if len(df):
        logger.info(
            "build detected in {:.2%} of files parsed".format(
                len(df.loc[df.build_detected]) / len(df)
            )
        )

    # extract files from the datadump where `load_file` returned a message
    if EXTRACT_FILES:
        # group files with same message (e.g., {"some message": ["file1", "file2"], ...})
        d = {}
        for result in results:
            if "msg" in result:
                d.setdefault(result["msg"], []).append(result["file"])

        # add messages / file filters as necessary...
        d["build not detected"] = list(df.loc[~df.build_detected].file.values)

        # extract files that have messages for debugging
        for msg, files in d.items():
            if not files:
                continue

            # create a directory for each message (prefix indicates number of files)
            path = os.path.join(
                OUTPUT_DIR, "{:04}_{}".format(len(files), clean_str(msg))
            )
            create_dir(path)
            # save each file with message into created directory
            for filename in files:
                with atomic_write(os.path.join(path, filename), mode="wb") as f:
                    f.write(r.load_opensnp_datadump_file(filename))

    logger.info("stop")


# entry point when executed as a script (no side effects on import)
if __name__ == "__main__":
    main()
Loading

0 comments on commit 5a1a6bf

Please sign in to comment.