Skip to content

Commit

Permalink
Merge pull request #103 from apriha/develop
Browse files Browse the repository at this point in the history
v2.0.2
  • Loading branch information
apriha committed Oct 7, 2020
2 parents 2baeb62 + d2df6b2 commit 26d8474
Show file tree
Hide file tree
Showing 16 changed files with 218 additions and 257 deletions.
2 changes: 1 addition & 1 deletion CONTRIBUTING.rst
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ For merging, you should:

1. Ensure tests pass.
2. Update documentation when there's new API, functionality, etc.
3. Add yourself to ``CONTRIBUTORS.rst``.
3. Add yourself to ``CONTRIBUTORS.rst`` if you'd like.

Documentation
-------------
Expand Down
2 changes: 2 additions & 0 deletions CONTRIBUTORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ Alan Moffet `@amoffet`_
Anatoli Babenia `@abitrolly`_
Castedo Ellerman `@castedo`_
Gerard Manning `@GerardManning`_
Julian Runnels `@JulianRunnels`_
Kevin Arvai `@arvkevi`_
Phil Palmer `@PhilPalmer`_
Yoan Bouzin
Expand All @@ -40,5 +41,6 @@ Yoan Bouzin
.. _@abitrolly: https://github.com/abitrolly
.. _@castedo: https://github.com/castedo
.. _@GerardManning: https://github.com/GerardManning
.. _@JulianRunnels: https://github.com/JulianRunnels
.. _@arvkevi: https://github.com/arvkevi
.. _@PhilPalmer: https://github.com/PhilPalmer
6 changes: 3 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -156,12 +156,12 @@ As the data gets added, it's compared to the existing data, and SNP position and genotype
discrepancies are identified. (The discrepancy thresholds can be tuned via parameters.) These
discrepant SNPs are available for inspection after the merge via properties of the ``SNPs`` object.

Additionally, any non-called / null genotypes will be updated during the merge, if the file
being merged has a called genotype for the SNP.

>>> len(s.discrepant_merge_genotypes)
151

Additionally, any non-called / null genotypes will be updated during the merge, if the file
being merged has a called genotype for the SNP.

Finally, ``merge`` returns a list of ``dict``, where each ``dict`` has information corresponding
to the results of each merge (e.g., SNPs in common).

Expand Down
14 changes: 5 additions & 9 deletions analysis/parse-opensnp-files/parse_opensnp_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

# setup logger to output to file in output directory
logging.basicConfig(
filename="{}".format(os.path.join(OUTPUT_DIR, "parse-opensnp-files.txt")),
filename=f'{os.path.join(OUTPUT_DIR, "parse-opensnp-files.txt")}',
format="%(asctime)s: %(message)s",
filemode="w",
level=logging.INFO,
Expand Down Expand Up @@ -90,12 +90,10 @@ def main():

# log parsing statistics
file_count = len(filenames)
logger.info("{} files in the openSNP datadump".format(file_count))
logger.info("{:.2%} of openSNP datadump files parsed".format(len(df) / file_count))
logger.info(f"{file_count} files in the openSNP datadump")
logger.info(f"{(len(df) / file_count):.2%} of openSNP datadump files parsed")
logger.info(
"build detected in {:.2%} of files parsed".format(
len(df.loc[df.build_detected]) / len(df)
)
f"build detected in {len(df.loc[df.build_detected]) / len(df):.2%} of files parsed"
)

# extract files from the datadump where `load_file` returned a message
Expand All @@ -118,9 +116,7 @@ def main():
continue

# create a directory for each message (prefix indicates number of files)
path = os.path.join(
OUTPUT_DIR, "{:04}_{}".format(len(files), clean_str(msg))
)
path = os.path.join(OUTPUT_DIR, f"{len(files):04}_{clean_str(msg)}")
create_dir(path)
# save each file with message into created directory
for filename in files:
Expand Down
68 changes: 13 additions & 55 deletions analysis/xy-chrom-snp-ratios/xy-chrom-snp-ratios.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

# setup logger to output to file in output directory
logging.basicConfig(
filename="{}".format(os.path.join(OUTPUT_DIR, "xy-chrom-snp-ratios.txt")),
filename=f"{os.path.join(OUTPUT_DIR, 'xy-chrom-snp-ratios.txt')}",
format="%(asctime)s#%(message)s",
filemode="w",
level=logging.INFO,
Expand All @@ -35,10 +35,10 @@ def get_xy_chrom_snp_ratios(task):
file = task["file"]

try:
logger.info("loading {}".format(file))
logger.info(f"loading {file}")
s = SNPs(r.load_opensnp_datadump_file(file), assign_par_snps=False)
except Exception as err:
logger.error("{}#{}".format(file, err))
logger.error(f"{file}#{err}")
return None

try:
Expand Down Expand Up @@ -72,10 +72,10 @@ def get_xy_chrom_snp_ratios(task):
s.count,
]
else:
logger.info("{}#{}".format(file, "no SNPs processed"))
logger.info(f"{file}#{'no SNPs processed'}")

except Exception as err:
logger.error("{}#{}".format(file, err))
logger.error(f"{file}#{err}")
return None


Expand All @@ -99,9 +99,7 @@ def create_analysis_plot(

# start with a rectangular Figure
fig = plt.figure(figsize=(8, 8))
fig.suptitle(
"Analysis of openSNP datadump XY chrom SNP ratios; N = {}".format(len(df))
)
fig.suptitle(f"Analysis of openSNP datadump XY chrom SNP ratios; N = {len(df)}")

ax_scatter = plt.axes(rect_scatter)
ax_scatter.tick_params(direction="in", top=True, right=True)
Expand All @@ -127,12 +125,12 @@ def create_analysis_plot(
heterozygous_x_snps_threshold_line = ax_scatter.axvline(
x=heterozygous_x_snps_threshold,
c="blue",
label="Het. X threshold={}".format(heterozygous_x_snps_threshold),
label=f"Het. X threshold={heterozygous_x_snps_threshold}",
)
y_snps_not_null_threshold_line = ax_scatter.axhline(
y=y_snps_not_null_threshold,
c="red",
label="Y not null threshold={}".format(y_snps_not_null_threshold),
label=f"Y not null threshold={y_snps_not_null_threshold}",
)

# fill genotype areas
Expand Down Expand Up @@ -178,17 +176,7 @@ def create_analysis_plot(
x_offset = lim_x * 0.01
y_offset = lim_y * 0.01
ax_scatter.annotate(
"n={}".format(
len(
df_ratios.loc[
(
df_ratios.heterozygous_x_snps_ratio
< heterozygous_x_snps_threshold
)
& (df_ratios.y_snps_not_null_ratio < y_snps_not_null_threshold)
]
)
),
f"n={len(df_ratios.loc[(df_ratios.heterozygous_x_snps_ratio < heterozygous_x_snps_threshold) & (df_ratios.y_snps_not_null_ratio < y_snps_not_null_threshold)])}",
(
heterozygous_x_snps_threshold - x_offset,
y_snps_not_null_threshold - y_offset,
Expand All @@ -197,17 +185,7 @@ def create_analysis_plot(
va="top",
)
ax_scatter.annotate(
"n={}".format(
len(
df_ratios.loc[
(
df_ratios.heterozygous_x_snps_ratio
< heterozygous_x_snps_threshold
)
& (df_ratios.y_snps_not_null_ratio >= y_snps_not_null_threshold)
]
)
),
f"n={ len(df_ratios.loc[(df_ratios.heterozygous_x_snps_ratio < heterozygous_x_snps_threshold)& (df_ratios.y_snps_not_null_ratio >= y_snps_not_null_threshold)])}",
(
heterozygous_x_snps_threshold - x_offset,
y_snps_not_null_threshold + y_offset,
Expand All @@ -216,17 +194,7 @@ def create_analysis_plot(
va="bottom",
)
ax_scatter.annotate(
"n={}".format(
len(
df_ratios.loc[
(
df_ratios.heterozygous_x_snps_ratio
>= heterozygous_x_snps_threshold
)
& (df_ratios.y_snps_not_null_ratio >= y_snps_not_null_threshold)
]
)
),
f"n={len(df_ratios.loc[ (df_ratios.heterozygous_x_snps_ratio >= heterozygous_x_snps_threshold) & (df_ratios.y_snps_not_null_ratio >= y_snps_not_null_threshold)])}",
(
heterozygous_x_snps_threshold + x_offset,
y_snps_not_null_threshold + y_offset,
Expand All @@ -235,17 +203,7 @@ def create_analysis_plot(
va="bottom",
)
ax_scatter.annotate(
"n={}".format(
len(
df_ratios.loc[
(
df_ratios.heterozygous_x_snps_ratio
>= heterozygous_x_snps_threshold
)
& (df_ratios.y_snps_not_null_ratio < y_snps_not_null_threshold)
]
)
),
f"n={len(df_ratios.loc[(df_ratios.heterozygous_x_snps_ratio >= heterozygous_x_snps_threshold) & (df_ratios.y_snps_not_null_ratio < y_snps_not_null_threshold)])}",
(
heterozygous_x_snps_threshold + x_offset,
y_snps_not_null_threshold - y_offset,
Expand Down Expand Up @@ -307,7 +265,7 @@ def create_analysis_plot(

# save output
with atomic_write(
"{}".format(os.path.join(OUTPUT_DIR, "xy-chrom-snp-ratios.png")),
f"{os.path.join(OUTPUT_DIR, 'xy-chrom-snp-ratios.png')}",
mode="wb",
overwrite=True,
) as f:
Expand Down
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

README <readme>
output_files
installation
snps_banner
changelog
contributing
Expand Down
83 changes: 83 additions & 0 deletions docs/installation.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
Installation
============

``snps`` is `available <https://pypi.org/project/snps/>`_ on the
`Python Package Index <https://pypi.org>`_. Install ``snps`` (and its required
Python dependencies) via ``pip``::

$ pip install snps

Installation and Usage on a Raspberry Pi
----------------------------------------
The instructions below provide the steps to install ``snps`` on a
`Raspberry Pi <https://www.raspberrypi.org>`_ (tested with
"`Raspberry Pi OS <https://www.raspberrypi.org/downloads/raspberry-pi-os/>`_ (32-bit) Lite",
release date 2020-08-20). For more details about Python on the Raspberry Pi, see the
`Raspberry Pi Python documentation <https://www.raspberrypi.org/documentation/linux/software/python.md>`_.

.. note:: Text after a prompt (e.g., ``$``) is the command to type at the command line. The
instructions assume a fresh install of Raspberry Pi OS and that after logging in as
the ``pi`` user, the current working directory is ``/home/pi``.

1. Install ``pip`` for Python 3::

pi@raspberrypi:~ $ sudo apt install python3-pip

Press "y" followed by "enter" to continue. Installing ``pip`` enables us to install packages
from the Python Package Index.

2. Install the ``venv`` module::

pi@raspberrypi:~ $ sudo apt install python3-venv

Press "y" followed by "enter" to continue. The ``venv`` module enables us to create a
`virtual environment <https://docs.python.org/3/library/venv.html>`_ to isolate the ``snps``
installation from other system Python packages.

3. `Install ATLAS <https://github.com/Kitt-AI/snowboy/issues/262#issuecomment-324997127>`_::

pi@raspberrypi:~ $ sudo apt install libatlas-base-dev

Press "y" followed by "enter" to continue. ATLAS is required by `NumPy <https://numpy.org>`_, a
dependency of ``snps``.

4. Create a directory for ``snps`` and change working directory::

pi@raspberrypi:~ $ mkdir snps
pi@raspberrypi:~ $ cd snps

5. Create a virtual environment for ``snps``::

pi@raspberrypi:~/snps $ python3 -m venv .venv

The virtual environment is located at ``/home/pi/snps/.venv``.

6. Activate the virtual environment::

pi@raspberrypi:~/snps $ source .venv/bin/activate

Now when you invoke Python or ``pip``, the virtual environment's version will be used (as
indicated by the ``(.venv)`` before the prompt). This can be verified as follows::

(.venv) pi@raspberrypi:~/snps $ which python
/home/pi/snps/.venv/bin/python

7. Install ``snps``::

(.venv) pi@raspberrypi:~/snps $ pip install snps

8. Start Python::

(.venv) pi@raspberrypi:~/snps $ python
Python 3.7.3 (default, Jul 25 2020, 13:03:44)
[GCC 8.3.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>>

9. Use ``snps``; examples shown in the README should now work.

10. When you are finished using ``snps``, deactivate the virtual environment::

(.venv) pi@raspberrypi:~/snps $ deactivate
pi@raspberrypi:~/snps $

4 changes: 1 addition & 3 deletions src/snps/ensembl.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,7 @@ def perform_rest_action(self, endpoint, hdrs=None, params=None):
self.perform_rest_action(endpoint, hdrs, params)
else:
sys.stderr.write(
"Request failed for {0}: Status code: {1.code} Reason: {1.reason}\n".format(
endpoint, e
)
f"Request failed for {endpoint}: Status code: {e.code} Reason: {e.reason}\n"
)

return data
12 changes: 3 additions & 9 deletions src/snps/io/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,11 +268,7 @@ def _detect_build_from_comments(self, comments, source):
# allow more variations for VCF
if source == "vcf":
if "https://pypi.org/project/snps/" in comments: # remove `snps` version
comments = "{}{}".format(
comments[: comments.find("snps v")],
comments[comments.find("https://pypi.org/project/snps/") :],
)

comments = f"{comments[: comments.find('snps v')]}{comments[comments.find('https://pypi.org/project/snps/'):]}"
if "hg19" in comments:
return 37
elif "ncbi36" in comments:
Expand Down Expand Up @@ -827,9 +823,7 @@ def map_pos(x):
df["rsid"] = df["SNP Name"].apply(map_rsids)
df["chrom"] = df["SNP Name"].apply(map_chr)
df["pos"] = df["SNP Name"].apply(map_pos)
df["genotype"] = (
df["Allele1 - {}".format(strand)] + df["Allele2 - {}".format(strand)]
)
df["genotype"] = df[f"Allele1 - {strand}"] + df[f"Allele2 - {strand}"]
df.dropna(subset=["rsid", "chrom", "pos"], inplace=True)

df = df.astype(NORMALIZED_DTYPES)
Expand Down Expand Up @@ -1183,7 +1177,7 @@ def _parse_vcf(self, buffer, rsids):

record_array = [
rsid,
"{}".format(line_split[0]).strip("chr"),
f"{line_split[0]}".strip("chr"),
line_split[1],
genotype,
]
Expand Down
Loading

0 comments on commit 26d8474

Please sign in to comment.