Merge pull request #161 from apriha/develop

v2.6.0
apriha · Aug 24, 2022 · 6489834 · 6489834
2 parents 431d0c6 + f66c01d
commit 6489834
Show file tree

Hide file tree

Showing 13 changed files with 212 additions and 70 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
diff --git a/README.rst b/README.rst
@@ -48,6 +48,7 @@ genotype files from the following DNA testing sources:
 
 - `23andMe <https://www.23andme.com>`_
 - `Ancestry <https://www.ancestry.com>`_
+- `CircleDNA <https://circledna.com/>`_
 - `Código 46 <https://codigo46.com.mx>`_
 - `DNA.Land <https://dna.land>`_
 - `Family Tree DNA <https://www.familytreedna.com>`_

diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py
@@ -156,8 +156,22 @@ def read(self):
             return d
 
         if "23andMe" in first_line:
-            d = self.read_23andme(file, compression)
-        elif "Ancestry" in first_line:
+            # some 23andMe files have separate alleles
+            if comments.endswith(
+                "# rsid\tchromosome\tposition\tallele1\tallele2\n"
+            ) or comments.endswith(
+                "# rsid\tchromosome\tposition\tallele1\tallele2\r\n"
+            ):
+                d = self.read_23andme(file, compression, joined=False)
+            # some 23andMe files have a combined genotype
+            elif comments.endswith(
+                "# rsid\tchromosome\tposition\tgenotype\n"
+            ) or comments.endswith("# rsid\tchromosome\tposition\tgenotype\r\n"):
+                d = self.read_23andme(file, compression, joined=True)
+            # something we haven't seen before and can't handle
+            else:
+                return d
+        elif "AncestryDNA" in first_line:
             d = self.read_ancestry(file, compression)
         elif first_line.startswith("RSID"):
             d = self.read_ftdna(file, compression)
@@ -186,6 +200,8 @@ def read(self):
         elif first_line.startswith("[Header]"):
             # Global Screening Array, includes SANO and CODIGO46
             d = self.read_gsa(file, compression, comments)
+        elif "Circle" in first_line:
+            d = self.read_circledna(file, compression)
 
         # detect build from comments if build was not already detected from `read` method
         if not d["build"]:
@@ -430,7 +446,7 @@ def read_helper(self, source, parser):
 
         return {"snps": df, "source": source, "phased": phased, "build": build}
 
-    def read_23andme(self, file, compression):
+    def read_23andme(self, file, compression, joined=True):
         """Read and parse 23andMe file.
 
         https://www.23andme.com
@@ -447,15 +463,18 @@ def read_23andme(self, file, compression):
         """
 
         def parser():
+            if joined:
+                columnnames = ["rsid", "chrom", "pos", "genotype"]
+            else:
+                columnnames = ["rsid", "chrom", "pos", "allele1", "allele2"]
             df = pd.read_csv(
                 file,
                 comment="#",
                 sep="\t",
-                na_values="--",
-                names=["rsid", "chrom", "pos", "genotype"],
+                na_values=["--", "-"],
+                names=columnnames,
                 compression=compression,
             )
-            df = df.dropna(subset=["rsid", "chrom", "pos"])
             # turn number numbers into string numbers
             df["chrom"] = df["chrom"].map(
                 {
@@ -508,6 +527,12 @@ def parser():
                     "MT": "MT",
                 }
             )
+            if not joined:
+                # stick separate alleles together
+                df["genotype"] = df["allele1"] + df["allele2"]
+                del df["allele1"]
+                del df["allele2"]
+            df = df.dropna(subset=["rsid", "chrom", "pos"])
             df = df.astype(dtype=NORMALIZED_DTYPES)
             df = df.set_index("rsid")
             return (df,)
@@ -1116,6 +1141,60 @@ def parser():
 
         return self.read_helper("DNA.Land", parser)
 
+    def read_circledna(self, file, compression):
+        """Read and parse CircleDNA file.
+
+        https://circledna.com/
+
+        Notes
+        -----
+        This method attempts to read and parse a whole exome file, optionally compressed
+        with gzip or zip. Some assumptions are made throughout this process:
+
+            * SNPs that are not annotated with an RSID are skipped
+            * Insertions and deletions are skipped
+
+        Parameters
+        ----------
+        file : str or bytes
+            path to file or bytes to load
+
+        Returns
+        -------
+        dict
+            result of `read_helper`
+        """
+
+        def parser():
+            rs_chunks = []
+            with pd.read_csv(
+                file,
+                comment="#",
+                sep="\t",
+                chunksize=10000,
+                names=["rsid", "chrom", "pos", "genotype"],
+                compression=compression,
+            ) as reader:
+                for chunk in reader:
+
+                    # keep only SNPs with rsids and 3-character genotypes (e.g., A/G)
+                    tmp = chunk.loc[
+                        (chunk.rsid.str.startswith("rs"))
+                        & (chunk.genotype.str.len() == 3)
+                    ]
+                    if len(tmp) > 0:
+                        rs_chunks.append(tmp)
+
+            df = pd.concat(rs_chunks)
+            df.chrom = df.chrom.str[3:]
+            df.genotype = df.genotype.apply(lambda x: "".join(x.split("/")))
+            df = df.astype(NORMALIZED_DTYPES)
+            df.set_index("rsid", inplace=True)
+
+            return (df,)
+
+        return self.read_helper("CircleDNA", parser)
+
     def read_snps_csv(self, file, comments, compression):
         """Read and parse CSV file generated by ``snps``.
 

diff --git a/src/snps/io/writer.py b/src/snps/io/writer.py
@@ -50,7 +50,15 @@
 class Writer:
     """Class for writing SNPs to files."""
 
-    def __init__(self, snps=None, filename="", vcf=False, atomic=True, **kwargs):
+    def __init__(
+        self,
+        snps=None,
+        filename="",
+        vcf=False,
+        atomic=True,
+        vcf_alt_unavailable=".",
+        **kwargs,
+    ):
         """Initialize a `Writer`.
 
         Parameters
@@ -63,13 +71,16 @@ def __init__(self, snps=None, filename="", vcf=False, atomic=True, **kwargs):
             flag to save file as VCF
         atomic : bool
             atomically write output to a file on local filesystem
+        vcf_alt_unavailable : str
+            representation of VCF ALT allele when ALT is not able to be determined
         **kwargs
             additional parameters to `pandas.DataFrame.to_csv`
         """
         self._snps = snps
         self._filename = filename
         self._vcf = vcf
         self._atomic = atomic
+        self._vcf_alt_unavailable = vcf_alt_unavailable
         self._kwargs = kwargs
 
     def write(self):
@@ -79,7 +90,15 @@ def write(self):
             return (self._write_csv(),)
 
     @classmethod
-    def write_file(cls, snps=None, filename="", vcf=False, atomic=True, **kwargs):
+    def write_file(
+        cls,
+        snps=None,
+        filename="",
+        vcf=False,
+        atomic=True,
+        vcf_alt_unavailable=".",
+        **kwargs,
+    ):
         """Save SNPs to file.
 
         Parameters
@@ -92,6 +111,8 @@ def write_file(cls, snps=None, filename="", vcf=False, atomic=True, **kwargs):
             flag to save file as VCF
         atomic : bool
             atomically write output to a file on local filesystem
+        vcf_alt_unavailable : str
+            representation of VCF ALT allele when ALT is not able to be determined
         **kwargs
             additional parameters to `pandas.DataFrame.to_csv`
 
@@ -102,7 +123,14 @@ def write_file(cls, snps=None, filename="", vcf=False, atomic=True, **kwargs):
         discrepant_vcf_position : pd.DataFrame
             SNPs with discrepant positions discovered while saving VCF
         """
-        w = cls(snps=snps, filename=filename, vcf=vcf, atomic=atomic, **kwargs)
+        w = cls(
+            snps=snps,
+            filename=filename,
+            vcf=vcf,
+            atomic=atomic,
+            vcf_alt_unavailable=vcf_alt_unavailable,
+            **kwargs,
+        )
         return w.write()
 
     def _write_csv(self):
@@ -365,7 +393,7 @@ def _compute_alt(self, ref, genotype):
 
         if ref in genotype_alleles:
             if len(genotype_alleles) == 1:
-                return "N"
+                return self._vcf_alt_unavailable
             else:
                 genotype_alleles.remove(ref)
                 return genotype_alleles.pop(0)

diff --git a/src/snps/snps.py b/src/snps/snps.py
@@ -584,7 +584,9 @@ def valid(self):
         else:
             return True
 
-    def save(self, filename="", vcf=False, atomic=True, **kwargs):
+    def save(
+        self, filename="", vcf=False, atomic=True, vcf_alt_unavailable=".", **kwargs
+    ):
         """Save SNPs to file.
 
         Parameters
@@ -595,6 +597,8 @@ def save(self, filename="", vcf=False, atomic=True, **kwargs):
             flag to save file as VCF
         atomic : bool
             atomically write output to a file on local filesystem
+        vcf_alt_unavailable : str
+            representation of VCF ALT allele when ALT is not able to be determined
         **kwargs
             additional parameters to `pandas.DataFrame.to_csv`
 
@@ -612,7 +616,12 @@ def save(self, filename="", vcf=False, atomic=True, **kwargs):
             kwargs["sep"] = "\t"
 
         path, *extra = Writer.write_file(
-            snps=self, filename=filename, vcf=vcf, atomic=atomic, **kwargs
+            snps=self,
+            filename=filename,
+            vcf=vcf,
+            atomic=atomic,
+            vcf_alt_unavailable=vcf_alt_unavailable,
+            **kwargs,
         )
 
         if len(extra) == 1 and not extra[0].empty:

diff --git a/tests/input/23andme.txt b/tests/input/23andme.txt
@@ -21,3 +21,5 @@ rs5	1	105	--
 rs6	1	106	GC
 rs7	1	107	TC
 rs8	1	108	AT
+rs9	--	109	AT
+rs10	--	--	AT
diff --git a/tests/input/23andme_allele.txt b/tests/input/23andme_allele.txt
@@ -0,0 +1,23 @@
+# 23andMe
+#
+#
+#
+#
+#
+#
+#
+#
+#
+#
+#
+#
+#
+# rsid	chromosome	position	allele1	allele2
+rs1	1	101	A	A
+rs2	1	102	C	C
+rs3	1	103	G	G
+rs4	1	104	T	T
+rs5	1	105	-	-
+rs6	1	106	G	C
+rs7	1	107	T	C
+rs8	1	108	A	T
diff --git a/tests/input/23andme_win.txt b/tests/input/23andme_win.txt
@@ -0,0 +1,25 @@
+# 23andMe
+#
+#
+#
+#
+#
+#
+#
+#
+#
+#
+#
+#
+#
+# rsid	chromosome	position	genotype
+rs1	1	101	AA
+rs2	1	102	CC
+rs3	1	103	GG
+rs4	1	104	TT
+rs5	1	105	--
+rs6	1	106	GC
+rs7	1	107	TC
+rs8	1	108	AT
+rs9	--	109	AT
+rs10	--	--	AT
diff --git a/tests/input/ancestry.txt b/tests/input/ancestry.txt
@@ -1,4 +1,4 @@
-#Ancestry
+#AncestryDNA
 #
 #
 #

diff --git a/tests/input/ancestry_mt.txt b/tests/input/ancestry_mt.txt
@@ -1,4 +1,4 @@
-#Ancestry
+#AncestryDNA
 rsid	chromosome	position	allele1	allele2
 rs1	26	101	A	A
 rs2	26	102	0	0

diff --git a/tests/input/ancestry_multi_sep.txt b/tests/input/ancestry_multi_sep.txt
@@ -1,4 +1,4 @@
-#Ancestry
+#AncestryDNA
 #
 #
 #