Merge pull request #103 from apriha/develop

v2.0.2
apriha · Oct 7, 2020 · 26d8474 · 26d8474
2 parents 2baeb62 + d2df6b2
commit 26d8474
Show file tree

Hide file tree

Showing 16 changed files with 218 additions and 257 deletions.
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
@@ -87,7 +87,7 @@ For merging, you should:
 
 1. Ensure tests pass.
 2. Update documentation when there's new API, functionality, etc.
-3. Add yourself to ``CONTRIBUTORS.rst``.
+3. Add yourself to ``CONTRIBUTORS.rst`` if you'd like.
 
 Documentation
 -------------

diff --git a/CONTRIBUTORS.rst b/CONTRIBUTORS.rst
@@ -31,6 +31,7 @@ Alan Moffet      `@amoffet`_
 Anatoli Babenia  `@abitrolly`_
 Castedo Ellerman `@castedo`_
 Gerard Manning   `@GerardManning`_
+Julian Runnels   `@JulianRunnels`_
 Kevin Arvai      `@arvkevi`_
 Phil Palmer      `@PhilPalmer`_
 Yoan Bouzin
@@ -40,5 +41,6 @@ Yoan Bouzin
 .. _@abitrolly: https://github.com/abitrolly
 .. _@castedo: https://github.com/castedo
 .. _@GerardManning: https://github.com/GerardManning
+.. _@JulianRunnels: https://github.com/JulianRunnels
 .. _@arvkevi: https://github.com/arvkevi
 .. _@PhilPalmer: https://github.com/PhilPalmer
diff --git a/README.rst b/README.rst
@@ -156,12 +156,12 @@ As the data gets added, it's compared to the existing data, and SNP position and
 discrepancies are identified. (The discrepancy thresholds can be tuned via parameters.) These
 discrepant SNPs are available for inspection after the merge via properties of the ``SNPs`` object.
 
-Additionally, any non-called / null genotypes will be updated during the merge, if the file
-being merged has a called genotype for the SNP.
-
 >>> len(s.discrepant_merge_genotypes)
 151
 
+Additionally, any non-called / null genotypes will be updated during the merge, if the file
+being merged has a called genotype for the SNP.
+
 Finally, ``merge`` returns a list of ``dict``, where each ``dict`` has information corresponding
 to the results of each merge (e.g., SNPs in common).
 

diff --git a/analysis/parse-opensnp-files/parse_opensnp_files.py b/analysis/parse-opensnp-files/parse_opensnp_files.py
@@ -28,7 +28,7 @@
 
 # setup logger to output to file in output directory
 logging.basicConfig(
-    filename="{}".format(os.path.join(OUTPUT_DIR, "parse-opensnp-files.txt")),
+    filename=f'{os.path.join(OUTPUT_DIR, "parse-opensnp-files.txt")}',
     format="%(asctime)s: %(message)s",
     filemode="w",
     level=logging.INFO,
@@ -90,12 +90,10 @@ def main():
 
     # log parsing statistics
     file_count = len(filenames)
-    logger.info("{} files in the openSNP datadump".format(file_count))
-    logger.info("{:.2%} of openSNP datadump files parsed".format(len(df) / file_count))
+    logger.info(f"{file_count} files in the openSNP datadump")
+    logger.info(f"{(len(df) / file_count):.2%} of openSNP datadump files parsed")
     logger.info(
-        "build detected in {:.2%} of files parsed".format(
-            len(df.loc[df.build_detected]) / len(df)
-        )
+        f"build detected in {len(df.loc[df.build_detected]) / len(df):.2%} of files parsed"
     )
 
     # extract files from the datadump where `load_file` returned a message
@@ -118,9 +116,7 @@ def main():
                 continue
 
             # create a directory for each message (prefix indicates number of files)
-            path = os.path.join(
-                OUTPUT_DIR, "{:04}_{}".format(len(files), clean_str(msg))
-            )
+            path = os.path.join(OUTPUT_DIR, f"{len(files):04}_{clean_str(msg)}")
             create_dir(path)
             # save each file with message into created directory
             for filename in files:

diff --git a/analysis/xy-chrom-snp-ratios/xy-chrom-snp-ratios.py b/analysis/xy-chrom-snp-ratios/xy-chrom-snp-ratios.py
@@ -22,7 +22,7 @@
 
 # setup logger to output to file in output directory
 logging.basicConfig(
-    filename="{}".format(os.path.join(OUTPUT_DIR, "xy-chrom-snp-ratios.txt")),
+    filename=f"{os.path.join(OUTPUT_DIR, 'xy-chrom-snp-ratios.txt')}",
     format="%(asctime)s#%(message)s",
     filemode="w",
     level=logging.INFO,
@@ -35,10 +35,10 @@ def get_xy_chrom_snp_ratios(task):
     file = task["file"]
 
     try:
-        logger.info("loading {}".format(file))
+        logger.info(f"loading {file}")
         s = SNPs(r.load_opensnp_datadump_file(file), assign_par_snps=False)
     except Exception as err:
-        logger.error("{}#{}".format(file, err))
+        logger.error(f"{file}#{err}")
         return None
 
     try:
@@ -72,10 +72,10 @@ def get_xy_chrom_snp_ratios(task):
                 s.count,
             ]
         else:
-            logger.info("{}#{}".format(file, "no SNPs processed"))
+            logger.info(f"{file}#{'no SNPs processed'}")
 
     except Exception as err:
-        logger.error("{}#{}".format(file, err))
+        logger.error(f"{file}#{err}")
         return None
 
 
@@ -99,9 +99,7 @@ def create_analysis_plot(
 
     # start with a rectangular Figure
     fig = plt.figure(figsize=(8, 8))
-    fig.suptitle(
-        "Analysis of openSNP datadump XY chrom SNP ratios; N = {}".format(len(df))
-    )
+    fig.suptitle(f"Analysis of openSNP datadump XY chrom SNP ratios; N = {len(df)}")
 
     ax_scatter = plt.axes(rect_scatter)
     ax_scatter.tick_params(direction="in", top=True, right=True)
@@ -127,12 +125,12 @@ def create_analysis_plot(
     heterozygous_x_snps_threshold_line = ax_scatter.axvline(
         x=heterozygous_x_snps_threshold,
         c="blue",
-        label="Het. X threshold={}".format(heterozygous_x_snps_threshold),
+        label=f"Het. X threshold={heterozygous_x_snps_threshold}",
     )
     y_snps_not_null_threshold_line = ax_scatter.axhline(
         y=y_snps_not_null_threshold,
         c="red",
-        label="Y not null threshold={}".format(y_snps_not_null_threshold),
+        label=f"Y not null threshold={y_snps_not_null_threshold}",
     )
 
     # fill genotype areas
@@ -178,17 +176,7 @@ def create_analysis_plot(
     x_offset = lim_x * 0.01
     y_offset = lim_y * 0.01
     ax_scatter.annotate(
-        "n={}".format(
-            len(
-                df_ratios.loc[
-                    (
-                        df_ratios.heterozygous_x_snps_ratio
-                        < heterozygous_x_snps_threshold
-                    )
-                    & (df_ratios.y_snps_not_null_ratio < y_snps_not_null_threshold)
-                ]
-            )
-        ),
+        f"n={len(df_ratios.loc[(df_ratios.heterozygous_x_snps_ratio < heterozygous_x_snps_threshold) & (df_ratios.y_snps_not_null_ratio < y_snps_not_null_threshold)])}",
         (
             heterozygous_x_snps_threshold - x_offset,
             y_snps_not_null_threshold - y_offset,
@@ -197,17 +185,7 @@ def create_analysis_plot(
         va="top",
     )
     ax_scatter.annotate(
-        "n={}".format(
-            len(
-                df_ratios.loc[
-                    (
-                        df_ratios.heterozygous_x_snps_ratio
-                        < heterozygous_x_snps_threshold
-                    )
-                    & (df_ratios.y_snps_not_null_ratio >= y_snps_not_null_threshold)
-                ]
-            )
-        ),
+        f"n={ len(df_ratios.loc[(df_ratios.heterozygous_x_snps_ratio < heterozygous_x_snps_threshold)& (df_ratios.y_snps_not_null_ratio >= y_snps_not_null_threshold)])}",
         (
             heterozygous_x_snps_threshold - x_offset,
             y_snps_not_null_threshold + y_offset,
@@ -216,17 +194,7 @@ def create_analysis_plot(
         va="bottom",
     )
     ax_scatter.annotate(
-        "n={}".format(
-            len(
-                df_ratios.loc[
-                    (
-                        df_ratios.heterozygous_x_snps_ratio
-                        >= heterozygous_x_snps_threshold
-                    )
-                    & (df_ratios.y_snps_not_null_ratio >= y_snps_not_null_threshold)
-                ]
-            )
-        ),
+        f"n={len(df_ratios.loc[ (df_ratios.heterozygous_x_snps_ratio >= heterozygous_x_snps_threshold) & (df_ratios.y_snps_not_null_ratio >= y_snps_not_null_threshold)])}",
         (
             heterozygous_x_snps_threshold + x_offset,
             y_snps_not_null_threshold + y_offset,
@@ -235,17 +203,7 @@ def create_analysis_plot(
         va="bottom",
     )
     ax_scatter.annotate(
-        "n={}".format(
-            len(
-                df_ratios.loc[
-                    (
-                        df_ratios.heterozygous_x_snps_ratio
-                        >= heterozygous_x_snps_threshold
-                    )
-                    & (df_ratios.y_snps_not_null_ratio < y_snps_not_null_threshold)
-                ]
-            )
-        ),
+        f"n={len(df_ratios.loc[(df_ratios.heterozygous_x_snps_ratio >= heterozygous_x_snps_threshold) & (df_ratios.y_snps_not_null_ratio < y_snps_not_null_threshold)])}",
         (
             heterozygous_x_snps_threshold + x_offset,
             y_snps_not_null_threshold - y_offset,
@@ -307,7 +265,7 @@ def create_analysis_plot(
 
     # save output
     with atomic_write(
-        "{}".format(os.path.join(OUTPUT_DIR, "xy-chrom-snp-ratios.png")),
+        f"{os.path.join(OUTPUT_DIR, 'xy-chrom-snp-ratios.png')}",
         mode="wb",
         overwrite=True,
     ) as f:

diff --git a/docs/index.rst b/docs/index.rst
@@ -14,6 +14,7 @@
 
    README <readme>
    output_files
+   installation
    snps_banner
    changelog
    contributing

diff --git a/docs/installation.rst b/docs/installation.rst
@@ -0,0 +1,83 @@
+Installation
+============
+
+``snps`` is `available <https://pypi.org/project/snps/>`_ on the
+`Python Package Index <https://pypi.org>`_. Install ``snps`` (and its required
+Python dependencies) via ``pip``::
+
+    $ pip install snps
+
+Installation and Usage on a Raspberry Pi
+----------------------------------------
+The instructions below provide the steps to install ``snps`` on a
+`Raspberry Pi <https://www.raspberrypi.org>`_ (tested with
+"`Raspberry Pi OS <https://www.raspberrypi.org/downloads/raspberry-pi-os/>`_ (32-bit) Lite",
+release date 2020-08-20). For more details about Python on the Raspberry Pi, see the
+`Raspberry Pi Python documentation <https://www.raspberrypi.org/documentation/linux/software/python.md>`_.
+
+.. note:: Text after a prompt (e.g., ``$``) is the command to type at the command line. The
+          instructions assume a fresh install of Raspberry Pi OS and that after logging in as
+          the ``pi`` user, the current working directory is ``/home/pi``.
+
+1. Install ``pip`` for Python 3::
+
+    pi@raspberrypi:~ $ sudo apt install python3-pip
+
+   Press "y" followed by "enter" to continue. This enables us to install packages from the
+   Python Package Index.
+
+2. Install the ``venv`` module::
+
+    pi@raspberrypi:~ $ sudo apt install python3-venv
+
+   Press "y" followed by "enter" to continue. This enables us to create a
+   `virtual environment <https://docs.python.org/3/library/venv.html>`_ to isolate the ``snps``
+   installation from other system Python packages.
+
+3. `Install ATLAS <https://github.com/Kitt-AI/snowboy/issues/262#issuecomment-324997127>`_::
+
+    pi@raspberrypi:~ $ sudo apt install libatlas-base-dev
+
+   Press "y" followed by "enter" to continue. This is required for `NumPy <https://numpy.org>`_, a
+   dependency of ``snps``.
+
+4. Create a directory for ``snps`` and change working directory::
+
+    pi@raspberrypi:~ $ mkdir snps
+    pi@raspberrypi:~ $ cd snps
+
+5. Create a virtual environment for ``snps``::
+
+    pi@raspberrypi:~/snps $ python3 -m venv .venv
+
+   The virtual environment is located at ``/home/pi/snps/.venv``.
+
+6. Activate the virtual environment::
+
+    pi@raspberrypi:~/snps $ source .venv/bin/activate
+
+   Now when you invoke Python or ``pip``, the virtual environment's version will be used (as
+   indicated by the ``(.venv)`` before the prompt). This can be verified as follows::
+
+    (.venv) pi@raspberrypi:~/snps $ which python
+    /home/pi/snps/.venv/bin/python
+
+7. Install ``snps``::
+
+    (.venv) pi@raspberrypi:~/snps $ pip install snps
+
+8. Start Python::
+
+    (.venv) pi@raspberrypi:~/snps $ python
+    Python 3.7.3 (default, Jul 25 2020, 13:03:44)
+    [GCC 8.3.0] on linux
+    Type "help", "copyright", "credits" or "license" for more information.
+    >>>
+
+9. Use ``snps``; examples shown in the README should now work.
+
+10. When you are finished using ``snps``, deactivate the virtual environment::
+
+     (.venv) pi@raspberrypi:~/snps $ deactivate
+     pi@raspberrypi:~/snps $
+
diff --git a/src/snps/ensembl.py b/src/snps/ensembl.py
@@ -116,9 +116,7 @@ def perform_rest_action(self, endpoint, hdrs=None, params=None):
                     self.perform_rest_action(endpoint, hdrs, params)
             else:
                 sys.stderr.write(
-                    "Request failed for {0}: Status code: {1.code} Reason: {1.reason}\n".format(
-                        endpoint, e
-                    )
+                    f"Request failed for {endpoint}: Status code: {e.code} Reason: {e.reason}\n"
                 )
 
         return data
diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py
@@ -268,11 +268,7 @@ def _detect_build_from_comments(self, comments, source):
         # allow more variations for VCF
         if source == "vcf":
             if "https://pypi.org/project/snps/" in comments:  # remove `snps` version
-                comments = "{}{}".format(
-                    comments[: comments.find("snps v")],
-                    comments[comments.find("https://pypi.org/project/snps/") :],
-                )
-
+                comments = f"{comments[: comments.find('snps v')]}{comments[comments.find('https://pypi.org/project/snps/'):]}"
             if "hg19" in comments:
                 return 37
             elif "ncbi36" in comments:
@@ -827,9 +823,7 @@ def map_pos(x):
             df["rsid"] = df["SNP Name"].apply(map_rsids)
             df["chrom"] = df["SNP Name"].apply(map_chr)
             df["pos"] = df["SNP Name"].apply(map_pos)
-            df["genotype"] = (
-                df["Allele1 - {}".format(strand)] + df["Allele2 - {}".format(strand)]
-            )
+            df["genotype"] = df[f"Allele1 - {strand}"] + df[f"Allele2 - {strand}"]
             df.dropna(subset=["rsid", "chrom", "pos"], inplace=True)
 
             df = df.astype(NORMALIZED_DTYPES)
@@ -1183,7 +1177,7 @@ def _parse_vcf(self, buffer, rsids):
 
                 record_array = [
                     rsid,
-                    "{}".format(line_split[0]).strip("chr"),
+                    f"{line_split[0]}".strip("chr"),
                     line_split[1],
                     genotype,
                 ]