Skip to content

Commit

Permalink
Merge pull request #72 from apriha/enhance-file-parsing
Browse files Browse the repository at this point in the history
Enhance file parsing and build detection
  • Loading branch information
apriha committed Jun 5, 2020
2 parents 621ad0e + 5ea82b7 commit 5a1a6bf
Show file tree
Hide file tree
Showing 19 changed files with 836 additions and 97 deletions.
15 changes: 15 additions & 0 deletions analysis/parse-opensnp-files/README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
parse-opensnp-files
===================
Scripts to load and debug the parsing of openSNP datadump files.

Method
------
Attempt to parse each file in the `openSNP <https://opensnp.org>`_ datadump by creating a
``SNPs`` object. For files where SNPs were loaded, save summary statistics to a dataframe and
output as a CSV. For files where no SNPs were loaded, save a message for each file indicating
the issue and optionally extract these files from the datadump for debugging.

Results
-------
As of May 2020, ``snps`` can parse ~96.6% of the genotype files in the datadump. Additionally,
``snps`` can detect the build in ~99.9% of those files.
21 changes: 21 additions & 0 deletions analysis/parse-opensnp-files/get_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
""" Get a file from the openSNP datadump for debugging. """

import os

from atomicwrites import atomic_write

from snps.resources import Resources
from snps.utils import create_dir

OUTPUT_DIR = "output"
FILE = "user662_file340_yearofbirth_unknown_sex_unknown.23andme.txt"

if __name__ == "__main__":
    # ensure the example's output directory exists
    create_dir(OUTPUT_DIR)

    # resources live two levels up when this is run from the examples dir
    resources = Resources(resources_dir="../../resources")

    # write atomically so an interrupted run leaves no partial file behind
    destination = os.path.join(OUTPUT_DIR, FILE)
    with atomic_write(destination, mode="wb") as f:
        f.write(resources.load_opensnp_datadump_file(FILE))
141 changes: 141 additions & 0 deletions analysis/parse-opensnp-files/parse_opensnp_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
""" Parse openSNP datadump files.
Attempt to parse each file in the openSNP datadump. For files where SNPs were loaded,
save summary statistics to a dataframe and output as a CSV. For files where no SNPs were
loaded, save a message for each file indicating the issue and optionally extract these
files from the datadump for debugging.
"""

import logging
import os
import random

from atomicwrites import atomic_write
import pandas as pd

from snps import SNPs
from snps.resources import Resources
from snps.utils import Parallelizer, save_df_as_csv, create_dir, clean_str

OUTPUT_DIR = "output"
# when True, copy unparseable files out of the datadump for debugging
EXTRACT_FILES = True

# create output directory for this example
create_dir(OUTPUT_DIR)

# assume script is being run from examples dir
r = Resources(resources_dir="../../resources")

# setup logger to output to file in output directory
logging.basicConfig(
    # os.path.join already returns a str; the "{}".format wrapper was redundant
    filename=os.path.join(OUTPUT_DIR, "parse-opensnp-files.txt"),
    format="%(asctime)s: %(message)s",
    filemode="w",
    level=logging.INFO,
)

logger = logging.getLogger()


def load_file(task):
    """Attempt to parse one openSNP datadump file as a ``SNPs`` object.

    Returns the object's summary dict (plus the filename) when SNPs were
    loaded; otherwise returns a dict with a short diagnostic message.
    """
    filename = task["file"]

    try:
        snps_obj = SNPs(
            r.load_opensnp_datadump_file(filename), assign_par_snps=False
        )
    except Exception as err:
        # truncate the exception text so identical failures group cleanly
        return {"msg": str(err).strip()[:100], "file": filename}

    if snps_obj.snp_count == 0:
        return {"msg": "no SNPs processed", "file": filename}

    summary = snps_obj.get_summary()
    summary.update({"file": filename})
    return summary


def main():
    """Parse the openSNP datadump files in parallel, save summary
    statistics as a CSV, log parsing rates, and optionally extract
    unparseable files for debugging.
    """
    logger.info("start")

    # get filenames from openSNP data dump
    filenames = r.get_opensnp_datadump_filenames()

    # keep only genotype files
    filenames = [
        filename
        for filename in filenames
        if "readme" not in filename and "phenotype" not in filename
    ]

    # draw a (currently exhaustive) reproducible sample of the observations
    random.seed(1)
    SAMPLE_SIZE = len(filenames)
    # SAMPLE_SIZE = 10
    samples = random.sample(range(len(filenames)), SAMPLE_SIZE)

    # setup tasks for parallelizing / execution on multiple cores
    p = Parallelizer(parallelize=True)
    tasks = [{"file": filenames[i]} for i in samples]

    # run tasks; results is a list of dicts
    results = p(load_file, tasks)

    # get results from `load_file` where `snp_count` was non-zero
    rows = [item for item in results if "msg" not in item]

    df = pd.DataFrame(
        rows,
        columns=[
            "file",
            "source",
            "build",
            "build_detected",
            "chromosomes",
            "snp_count",
        ],
    )

    save_df_as_csv(df, OUTPUT_DIR, "parse-opensnp-files.csv")

    # log parsing statistics; guard the ratios so an empty run doesn't
    # raise ZeroDivisionError
    file_count = len(filenames)
    logger.info("{} files in the openSNP datadump".format(file_count))
    if file_count:
        logger.info(
            "{:.2%} of openSNP datadump files parsed".format(len(df) / file_count)
        )
    if len(df):
        logger.info(
            "build detected in {:.2%} of files parsed".format(
                len(df.loc[df.build_detected]) / len(df)
            )
        )

    # extract files from the datadump where `load_file` returned a message
    if EXTRACT_FILES:
        # group files with same message (e.g., {"some message": ["file1", "file2"], ...})
        d = {}
        for result in results:
            if "msg" in result:
                d.setdefault(result["msg"], []).append(result["file"])

        # add messages / file filters as necessary...
        d["build not detected"] = list(df.loc[~df.build_detected].file.values)

        # extract files that have messages for debugging
        for msg, files in d.items():
            if not files:
                continue

            # create a directory for each message (prefix indicates number of files)
            path = os.path.join(
                OUTPUT_DIR, "{:04}_{}".format(len(files), clean_str(msg))
            )
            create_dir(path)
            # save each file with message into created directory
            for filename in files:
                with atomic_write(os.path.join(path, filename), mode="wb") as f:
                    f.write(r.load_opensnp_datadump_file(filename))

    logger.info("stop")


# entry point when executed as a script (no side effects on import)
if __name__ == "__main__":
    main()
Loading

0 comments on commit 5a1a6bf

Please sign in to comment.