Merge pull request #182 from apriha/feature/parse-plink

Parse PLINK
apriha · Aug 21, 2024 · 7df10b6
2 parents 8aac5ad + 813fe86
commit 7df10b6
Show file tree

Hide file tree

Showing 4 changed files with 100 additions and 0 deletions.
diff --git a/README.rst b/README.rst
@@ -58,6 +58,7 @@ genotype files from the following DNA testing sources:
 - `LivingDNA <https://livingdna.com>`_
 - `Mapmygenome <https://mapmygenome.in>`_
 - `MyHeritage <https://www.myheritage.com>`_
+- `PLINK <https://www.cog-genomics.org/plink/>`_
 - `Sano Genetics <https://sanogenetics.com>`_
 - `tellmeGen <https://www.tellmegen.com>`_
 

diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py
@@ -173,6 +173,8 @@ def read(self):
             d = self.read_gsa(file, compression, comments)
         elif "Circle" in first_line:
             d = self.read_circledna(file, compression)
+        elif "# Below is a text version of your data." in comments:
+            d = self.read_plink(file, compression)
 
         # detect build from comments if build was not already detected from `read` method
         if not d["build"]:
@@ -1180,6 +1182,90 @@ def parser():
 
         return self.read_helper("Sano", parser)
 
+    def read_plink(self, file, compression):
+        """Read and parse a PLINK text file.
+
+        The file is read as tab-separated text into the columns ``rsid``,
+        ``chrom``, ``pos`` and ``genotype``. Everything from ``#`` to the end
+        of a line is ignored, and ``--`` / ``-`` are read as missing values.
+        Rows without an rsid, a recognized chromosome label or a position are
+        dropped.
+
+        Parameters
+        ----------
+        file : str
+            path to file
+        compression
+            forwarded unchanged to the ``compression`` argument of
+            ``pandas.read_csv``
+
+        Returns
+        -------
+        dict
+            result of `read_helper`
+        """
+
+        def parser():
+            # Recognized chromosome labels and the string each one is stored
+            # as. Autosomes are accepted both as strings and as integers,
+            # since pandas may infer an integer dtype for the column.
+            autosomes = range(1, 23)
+            chrom_labels = {str(n): str(n) for n in autosomes}
+            chrom_labels.update({n: str(n) for n in autosomes})
+            # NOTE(review): "XY" is relabeled "Y" here -- confirm this is intended.
+            chrom_labels.update({"X": "X", "Y": "Y", "XY": "Y", "MT": "MT"})
+
+            snps = pd.read_csv(
+                file,
+                comment="#",
+                sep="\t",
+                na_values=["--", "-"],
+                names=["rsid", "chrom", "pos", "genotype"],
+                compression=compression,
+            )
+
+            # Labels missing from the table become NaN, so the dropna below
+            # removes those rows; a column-header row parsed as data goes the
+            # same way because "chromosome" is not a recognized label.
+            snps["chrom"] = snps["chrom"].map(chrom_labels)
+            snps = (
+                snps.dropna(subset=["rsid", "chrom", "pos"])
+                .astype(dtype=NORMALIZED_DTYPES)
+                .set_index("rsid")
+            )
+            return (snps,)
+
+        return self.read_helper("PLINK", parser)
+
     def read_snps_csv(self, file, comments, compression):
         """Read and parse CSV file generated by ``snps``.
 

diff --git a/tests/input/plink.txt b/tests/input/plink.txt
@@ -0,0 +1,10 @@
+# Below is a text version of your data.
+rsid	chromosome	position	genotype
+rs1	1	101	AA
+rs2	1	102	CC
+rs3	1	103	GG
+rs4	1	104	TT
+rs5	1	105	--
+rs6	1	106	GC
+rs7	1	107	TC
+rs8	1	108	AT
diff --git a/tests/io/test_reader.py b/tests/io/test_reader.py
@@ -154,6 +154,9 @@ def test_read_circledna(self):
         df.drop("rs5", inplace=True)  # only called genotypes
         self.run_parsing_tests("tests/input/circledna.txt", "CircleDNA", snps_df=df)
 
+    def test_read_plink(self):
+        # https://www.cog-genomics.org/plink/
+        plink_fixture = "tests/input/plink.txt"
+        self.run_parsing_tests(plink_fixture, "PLINK")
+
     def test_read_ftdna(self):
         # https://www.familytreedna.com
         self.run_parsing_tests("tests/input/ftdna.csv", "FTDNA")