Merge pull request #76 from apriha/develop
v1.2.0
apriha committed Jun 5, 2020
2 parents 1ed0a1f + 5a1a6bf commit 3ef1699
Showing 34 changed files with 4,454 additions and 2,983 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -115,7 +115,6 @@ tests/resources/*
 !tests/resources/gsa_rsid_map.txt
 tests/input/23andme.txt.zip
 tests/input/discrepant_snps[12].csv
-tests/input/empty.txt
 tests/input/ftdna.csv.gz
 tests/input/generic.fa.gz
 tests/input/testvcf.vcf.gz
27 changes: 9 additions & 18 deletions .travis.yml
@@ -9,35 +9,26 @@ language: python
 
 before_install:
   - pip install --upgrade pip setuptools wheel
-  - pip install pytest-cov codecov awscli
+  - pip install pytest-cov codecov
 
 install:
   - pip install .
 
 script:
-  # for testing, limit downloads from the resource servers by using cached resources;
-  # note that the master branch is tested weekly via `cron`, so this ensures all Python
-  # versions will be periodically integration tested with the resource servers
+  # for testing, limit downloads from the resource servers to only the selected job for
+  # PRs and the master branch; note that the master branch is tested weekly via `cron`,
+  # so this ensures all Python versions will be periodically integration tested with the
+  # resource servers
   - set -e
   - NUM_JOBS=4
   - SELECTED_JOB=$((10#$(date +%V) % $NUM_JOBS)) # identify a job based on week of the year
-  - DOWNLOADS_ENABLED=false
   - |
-    if [[ $TRAVIS_PULL_REQUEST != "false" ]]; then
-      # download resources for all jobs on a pull request
-      DOWNLOADS_ENABLED=true
+    if [[ $TRAVIS_PULL_REQUEST != "false" && $SELECTED_JOB == $JOB_ID ]]; then
+      # download resources for selected job on a pull request
+      export DOWNLOADS_ENABLED=true
     elif [[ $TRAVIS_BRANCH == "master" && $SELECTED_JOB == $JOB_ID ]]; then
       # download resources for selected job on master branch
-      DOWNLOADS_ENABLED=true
-    fi
-  - |
-    if [[ $DOWNLOADS_ENABLED == "false" ]]; then
-      # use cached resources on Amazon S3
-      aws s3 cp s3://snps-resources/resources.tar.gz resources.tar.gz
-      if [[ -f resources.tar.gz ]]; then
-        tar -xzf resources.tar.gz
-        rm resources.tar.gz
-      fi
+      export DOWNLOADS_ENABLED=true
     fi
   - pytest --cov=snps tests
   - |
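A note on the job-selection arithmetic above: `date +%V` prints the ISO week of the year zero-padded (e.g., `08`), and the `10#` prefix forces base-10 interpretation so bash does not reject `08` or `09` as invalid octal literals. A minimal Python sketch of the same weekly rotation (here `JOB_ID` stands in for the per-job index that each job in the Travis build matrix is assumed to define):

import datetime

NUM_JOBS = 4  # size of the build matrix (one job per Python version)
JOB_ID = 0  # assumed per-job index, 0..NUM_JOBS-1

# ISO week of the year (1-53), equivalent to `date +%V`
week = datetime.date.today().isocalendar()[1]

# rotate which job downloads from the resource servers, one job per week
downloads_enabled = week % NUM_JOBS == JOB_ID
print(downloads_enabled)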
15 changes: 15 additions & 0 deletions analysis/parse-opensnp-files/README.rst
@@ -0,0 +1,15 @@
parse-opensnp-files
===================
scripts to load and debug parsing of openSNP datadump files

Method
------
Attempt to parse each file in the `openSNP <https://opensnp.org>`_ datadump by creating a
``SNPs`` object. For files where SNPs were loaded, save summary statistics to a dataframe and
output as a CSV. For files where no SNPs were loaded, save a message for each file indicating
the issue and optionally extract these files from the datadump for debugging.

Results
-------
As of May 2020, ``snps`` can parse ~96.6% of the genotype files in the datadump. Additionally,
``snps`` can detect the build in ~99.9% of those files.
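For reference, the core of the method above is just constructing a ``SNPs`` object from a file's raw bytes; a minimal sketch using the same calls as the scripts below (the filename here is hypothetical, and a ``resources`` directory is assumed):

from snps import SNPs
from snps.resources import Resources

r = Resources(resources_dir="resources")

# hypothetical filename from the datadump
file = "user1_file1_yearofbirth_unknown_sex_unknown.23andme.txt"

# attempt to parse the file by creating a SNPs object
s = SNPs(r.load_opensnp_datadump_file(file), assign_par_snps=False)

if s.snp_count != 0:
    # dict with source, build, build_detected, chromosomes, snp_count, etc.
    print(s.get_summary())
else:
    print("no SNPs processed")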
21 changes: 21 additions & 0 deletions analysis/parse-opensnp-files/get_file.py
@@ -0,0 +1,21 @@
""" Get a file from the openSNP datadump for debugging. """

import os

from atomicwrites import atomic_write

from snps.resources import Resources
from snps.utils import create_dir

OUTPUT_DIR = "output"
FILE = "user662_file340_yearofbirth_unknown_sex_unknown.23andme.txt"

if __name__ == "__main__":
    # create output directory for this example
    create_dir(OUTPUT_DIR)

    # assume script is being run from the analysis/parse-opensnp-files dir
    r = Resources(resources_dir="../../resources")

    with atomic_write(os.path.join(OUTPUT_DIR, FILE), mode="wb") as f:
        f.write(r.load_opensnp_datadump_file(FILE))
141 changes: 141 additions & 0 deletions analysis/parse-opensnp-files/parse_opensnp_files.py
@@ -0,0 +1,141 @@
""" Parse openSNP datadump files.
Attempt to parse each file in the openSNP datadump. For files where SNPs were loaded,
save summary statistics to a dataframe and output as a CSV. For files where no SNPs were
loaded, save a message for each file indicating the issue and optionally extract these
files from the datadump for debugging.
"""

import logging
import os
import random

from atomicwrites import atomic_write
import pandas as pd

from snps import SNPs
from snps.resources import Resources
from snps.utils import Parallelizer, save_df_as_csv, create_dir, clean_str

OUTPUT_DIR = "output"
EXTRACT_FILES = True

# create output directory for this example
create_dir(OUTPUT_DIR)

# assume script is being run from the analysis/parse-opensnp-files dir
r = Resources(resources_dir="../../resources")

# set up logger to output to a file in the output directory
logging.basicConfig(
    filename=os.path.join(OUTPUT_DIR, "parse-opensnp-files.txt"),
    format="%(asctime)s: %(message)s",
    filemode="w",
    level=logging.INFO,
)

logger = logging.getLogger()


def load_file(task):
    file = task["file"]

    try:
        s = SNPs(r.load_opensnp_datadump_file(file), assign_par_snps=False)
    except Exception as err:
        return {"msg": str(err).strip()[:100], "file": file}

    if s.snp_count != 0:
        d = s.get_summary()
        d.update({"file": file})
        return d
    else:
        return {"msg": "no SNPs processed", "file": file}


def main():
    logger.info("start")

    # get filenames from openSNP data dump
    filenames = r.get_opensnp_datadump_filenames()

    filenames = [
        filename
        for filename in filenames
        if "readme" not in filename and "phenotype" not in filename
    ]

    # draw a sample from the observations
    random.seed(1)
    SAMPLE_SIZE = len(filenames)
    # SAMPLE_SIZE = 10
    samples = random.sample(range(len(filenames)), SAMPLE_SIZE)

    # set up tasks for parallelizing / execution on multiple cores
    p = Parallelizer(parallelize=True)
    tasks = [{"file": filenames[i]} for i in samples]

    # run tasks; results is a list of dicts
    results = p(load_file, tasks)

    # get results from `load_file` where `snp_count` was non-zero
    rows = [item for item in results if "msg" not in item]

    df = pd.DataFrame(
        rows,
        columns=[
            "file",
            "source",
            "build",
            "build_detected",
            "chromosomes",
            "snp_count",
        ],
    )

    save_df_as_csv(df, OUTPUT_DIR, "parse-opensnp-files.csv")

    # log parsing statistics
    file_count = len(filenames)
    logger.info("{} files in the openSNP datadump".format(file_count))
    logger.info("{:.2%} of openSNP datadump files parsed".format(len(df) / file_count))
    logger.info(
        "build detected in {:.2%} of files parsed".format(
            len(df.loc[df.build_detected]) / len(df)
        )
    )

    # extract files from the datadump where `load_file` returned a message
    if EXTRACT_FILES:
        # group files with same message (e.g., {"some message": ["file1", "file2"], ...})
        d = {}
        for result in results:
            if "msg" in result:
                if result["msg"] in d:
                    d[result["msg"]].append(result["file"])
                else:
                    d[result["msg"]] = [result["file"]]

        # add messages / file filters as necessary...
        d["build not detected"] = list(df.loc[~df.build_detected].file.values)

        # extract files that have messages for debugging
        for msg, files in d.items():
            if len(files) == 0:
                continue

            # create a directory for each message (prefix indicates number of files)
            path = os.path.join(
                OUTPUT_DIR, "{:04}_{}".format(len(files), clean_str(msg))
            )
            create_dir(path)
            # save each file with message into created directory
            for filename in files:
                with atomic_write(os.path.join(path, filename), mode="wb") as f:
                    f.write(r.load_opensnp_datadump_file(filename))

    logger.info("stop")


if __name__ == "__main__":
    main()