Skip to content

Commit

Permalink
Merge pull request #161 from apriha/develop
Browse files Browse the repository at this point in the history
v2.6.0
  • Loading branch information
apriha committed Aug 24, 2022
2 parents 431d0c6 + f66c01d commit 6489834
Show file tree
Hide file tree
Showing 13 changed files with 212 additions and 70 deletions.
54 changes: 0 additions & 54 deletions .circleci/config.yml

This file was deleted.

1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ genotype files from the following DNA testing sources:

- `23andMe <https://www.23andme.com>`_
- `Ancestry <https://www.ancestry.com>`_
- `CircleDNA <https://circledna.com/>`_
- `Código 46 <https://codigo46.com.mx>`_
- `DNA.Land <https://dna.land>`_
- `Family Tree DNA <https://www.familytreedna.com>`_
Expand Down
91 changes: 85 additions & 6 deletions src/snps/io/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,22 @@ def read(self):
return d

if "23andMe" in first_line:
d = self.read_23andme(file, compression)
elif "Ancestry" in first_line:
# some 23andMe files have separate alleles
if comments.endswith(
"# rsid\tchromosome\tposition\tallele1\tallele2\n"
) or comments.endswith(
"# rsid\tchromosome\tposition\tallele1\tallele2\r\n"
):
d = self.read_23andme(file, compression, joined=False)
# some 23andMe files have a combined genotype
elif comments.endswith(
"# rsid\tchromosome\tposition\tgenotype\n"
) or comments.endswith("# rsid\tchromosome\tposition\tgenotype\r\n"):
d = self.read_23andme(file, compression, joined=True)
# something we haven't seen before and can't handle
else:
return d
elif "AncestryDNA" in first_line:
d = self.read_ancestry(file, compression)
elif first_line.startswith("RSID"):
d = self.read_ftdna(file, compression)
Expand Down Expand Up @@ -186,6 +200,8 @@ def read(self):
elif first_line.startswith("[Header]"):
# Global Screening Array, includes SANO and CODIGO46
d = self.read_gsa(file, compression, comments)
elif "Circle" in first_line:
d = self.read_circledna(file, compression)

# detect build from comments if build was not already detected from `read` method
if not d["build"]:
Expand Down Expand Up @@ -430,7 +446,7 @@ def read_helper(self, source, parser):

return {"snps": df, "source": source, "phased": phased, "build": build}

def read_23andme(self, file, compression):
def read_23andme(self, file, compression, joined=True):
"""Read and parse 23andMe file.
https://www.23andme.com
Expand All @@ -447,15 +463,18 @@ def read_23andme(self, file, compression):
"""

def parser():
if joined:
columnnames = ["rsid", "chrom", "pos", "genotype"]
else:
columnnames = ["rsid", "chrom", "pos", "allele1", "allele2"]
df = pd.read_csv(
file,
comment="#",
sep="\t",
na_values="--",
names=["rsid", "chrom", "pos", "genotype"],
na_values=["--", "-"],
names=columnnames,
compression=compression,
)
df = df.dropna(subset=["rsid", "chrom", "pos"])
# convert numeric chromosome values into their string equivalents
df["chrom"] = df["chrom"].map(
{
Expand Down Expand Up @@ -508,6 +527,12 @@ def parser():
"MT": "MT",
}
)
if not joined:
# stick separate alleles together
df["genotype"] = df["allele1"] + df["allele2"]
del df["allele1"]
del df["allele2"]
df = df.dropna(subset=["rsid", "chrom", "pos"])
df = df.astype(dtype=NORMALIZED_DTYPES)
df = df.set_index("rsid")
return (df,)
Expand Down Expand Up @@ -1116,6 +1141,60 @@ def parser():

return self.read_helper("DNA.Land", parser)

def read_circledna(self, file, compression):
    """Read and parse CircleDNA file.

    https://circledna.com/

    Notes
    -----
    This method attempts to read and parse a whole exome file, optionally
    compressed with gzip or zip. Some assumptions are made throughout this
    process:

    * SNPs that are not annotated with an RSID are skipped
    * Insertions and deletions are skipped (only genotypes that are exactly
      three characters long, e.g. ``A/G``, are kept)

    Parameters
    ----------
    file : str or bytes
        path to file or bytes to load
    compression : str or None
        compression argument forwarded to `pandas.read_csv`

    Returns
    -------
    dict
        result of `read_helper`
    """

    def parser():
        columns = ["rsid", "chrom", "pos", "genotype"]
        rs_chunks = []

        # The file is read in chunks so that only the rows that survive the
        # filter below are accumulated.
        with pd.read_csv(
            file,
            comment="#",
            sep="\t",
            chunksize=10000,
            names=columns,
            # Pin the text columns to strings: dtypes are otherwise inferred
            # per chunk, and a chunk whose column is entirely empty would be
            # read as float, breaking the `.str` accessors below.
            dtype={"rsid": str, "chrom": str, "genotype": str},
            compression=compression,
        ) as reader:
            for chunk in reader:
                # `na=False` makes rows with a missing rsid evaluate to False
                # instead of NaN, which is not a valid boolean mask.
                has_rsid = chunk["rsid"].str.startswith("rs", na=False)
                # A missing genotype has NaN length, which compares unequal
                # to 3 and is therefore dropped as well.
                is_snp = chunk["genotype"].str.len() == 3
                tmp = chunk.loc[has_rsid & is_snp]
                if not tmp.empty:
                    rs_chunks.append(tmp)

        if rs_chunks:
            df = pd.concat(rs_chunks)
        else:
            # `pd.concat` raises ValueError on an empty list; return an
            # empty, correctly shaped frame when nothing passed the filter.
            df = pd.DataFrame(columns=columns)

        # Drop the first three characters of the chromosome label
        # (presumably a "chr" prefix -- confirm against sample files).
        df["chrom"] = df["chrom"].str[3:]
        # Join the two alleles by removing the "/" separator.
        df["genotype"] = df["genotype"].str.replace("/", "", regex=False)
        df = df.astype(NORMALIZED_DTYPES)
        df = df.set_index("rsid")

        return (df,)

    return self.read_helper("CircleDNA", parser)

def read_snps_csv(self, file, comments, compression):
"""Read and parse CSV file generated by ``snps``.
Expand Down
36 changes: 32 additions & 4 deletions src/snps/io/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,15 @@
class Writer:
"""Class for writing SNPs to files."""

def __init__(self, snps=None, filename="", vcf=False, atomic=True, **kwargs):
def __init__(
self,
snps=None,
filename="",
vcf=False,
atomic=True,
vcf_alt_unavailable=".",
**kwargs,
):
"""Initialize a `Writer`.
Parameters
Expand All @@ -63,13 +71,16 @@ def __init__(self, snps=None, filename="", vcf=False, atomic=True, **kwargs):
flag to save file as VCF
atomic : bool
atomically write output to a file on local filesystem
vcf_alt_unavailable : str
representation of VCF ALT allele when ALT is not able to be determined
**kwargs
additional parameters to `pandas.DataFrame.to_csv`
"""
self._snps = snps
self._filename = filename
self._vcf = vcf
self._atomic = atomic
self._vcf_alt_unavailable = vcf_alt_unavailable
self._kwargs = kwargs

def write(self):
Expand All @@ -79,7 +90,15 @@ def write(self):
return (self._write_csv(),)

@classmethod
def write_file(cls, snps=None, filename="", vcf=False, atomic=True, **kwargs):
def write_file(
cls,
snps=None,
filename="",
vcf=False,
atomic=True,
vcf_alt_unavailable=".",
**kwargs,
):
"""Save SNPs to file.
Parameters
Expand All @@ -92,6 +111,8 @@ def write_file(cls, snps=None, filename="", vcf=False, atomic=True, **kwargs):
flag to save file as VCF
atomic : bool
atomically write output to a file on local filesystem
vcf_alt_unavailable : str
representation of VCF ALT allele when ALT is not able to be determined
**kwargs
additional parameters to `pandas.DataFrame.to_csv`
Expand All @@ -102,7 +123,14 @@ def write_file(cls, snps=None, filename="", vcf=False, atomic=True, **kwargs):
discrepant_vcf_position : pd.DataFrame
SNPs with discrepant positions discovered while saving VCF
"""
w = cls(snps=snps, filename=filename, vcf=vcf, atomic=atomic, **kwargs)
w = cls(
snps=snps,
filename=filename,
vcf=vcf,
atomic=atomic,
vcf_alt_unavailable=vcf_alt_unavailable,
**kwargs,
)
return w.write()

def _write_csv(self):
Expand Down Expand Up @@ -365,7 +393,7 @@ def _compute_alt(self, ref, genotype):

if ref in genotype_alleles:
if len(genotype_alleles) == 1:
return "N"
return self._vcf_alt_unavailable
else:
genotype_alleles.remove(ref)
return genotype_alleles.pop(0)
Expand Down
13 changes: 11 additions & 2 deletions src/snps/snps.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,7 +584,9 @@ def valid(self):
else:
return True

def save(self, filename="", vcf=False, atomic=True, **kwargs):
def save(
self, filename="", vcf=False, atomic=True, vcf_alt_unavailable=".", **kwargs
):
"""Save SNPs to file.
Parameters
Expand All @@ -595,6 +597,8 @@ def save(self, filename="", vcf=False, atomic=True, **kwargs):
flag to save file as VCF
atomic : bool
atomically write output to a file on local filesystem
vcf_alt_unavailable : str
representation of VCF ALT allele when ALT is not able to be determined
**kwargs
additional parameters to `pandas.DataFrame.to_csv`
Expand All @@ -612,7 +616,12 @@ def save(self, filename="", vcf=False, atomic=True, **kwargs):
kwargs["sep"] = "\t"

path, *extra = Writer.write_file(
snps=self, filename=filename, vcf=vcf, atomic=atomic, **kwargs
snps=self,
filename=filename,
vcf=vcf,
atomic=atomic,
vcf_alt_unavailable=vcf_alt_unavailable,
**kwargs,
)

if len(extra) == 1 and not extra[0].empty:
Expand Down
2 changes: 2 additions & 0 deletions tests/input/23andme.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@ rs5 1 105 --
rs6 1 106 GC
rs7 1 107 TC
rs8 1 108 AT
rs9 -- 109 AT
rs10 -- -- AT
23 changes: 23 additions & 0 deletions tests/input/23andme_allele.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# 23andMe
#
#
#
#
#
#
#
#
#
#
#
#
#
# rsid chromosome position allele1 allele2
rs1 1 101 A A
rs2 1 102 C C
rs3 1 103 G G
rs4 1 104 T T
rs5 1 105 - -
rs6 1 106 G C
rs7 1 107 T C
rs8 1 108 A T
25 changes: 25 additions & 0 deletions tests/input/23andme_win.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# 23andMe
#
#
#
#
#
#
#
#
#
#
#
#
#
# rsid chromosome position genotype
rs1 1 101 AA
rs2 1 102 CC
rs3 1 103 GG
rs4 1 104 TT
rs5 1 105 --
rs6 1 106 GC
rs7 1 107 TC
rs8 1 108 AT
rs9 -- 109 AT
rs10 -- -- AT
2 changes: 1 addition & 1 deletion tests/input/ancestry.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#Ancestry
#AncestryDNA
#
#
#
Expand Down
2 changes: 1 addition & 1 deletion tests/input/ancestry_mt.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#Ancestry
#AncestryDNA
rsid chromosome position allele1 allele2
rs1 26 101 A A
rs2 26 102 0 0
Expand Down
2 changes: 1 addition & 1 deletion tests/input/ancestry_multi_sep.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#Ancestry
#AncestryDNA
#
#
#
Expand Down
Loading

0 comments on commit 6489834

Please sign in to comment.