Skip to content

Commit

Permalink
Merge pull request #182 from apriha/feature/parse-plink
Browse files Browse the repository at this point in the history
Parse PLINK
  • Loading branch information
apriha committed Aug 21, 2024
2 parents 8aac5ad + 813fe86 commit 7df10b6
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ genotype files from the following DNA testing sources:
- `LivingDNA <https://livingdna.com>`_
- `Mapmygenome <https://mapmygenome.in>`_
- `MyHeritage <https://www.myheritage.com>`_
- `PLINK <https://www.cog-genomics.org/plink/>`_
- `Sano Genetics <https://sanogenetics.com>`_
- `tellmeGen <https://www.tellmegen.com>`_

Expand Down
86 changes: 86 additions & 0 deletions src/snps/io/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,8 @@ def read(self):
d = self.read_gsa(file, compression, comments)
elif "Circle" in first_line:
d = self.read_circledna(file, compression)
elif "# Below is a text version of your data." in comments:
d = self.read_plink(file, compression)

# detect build from comments if build was not already detected from `read` method
if not d["build"]:
Expand Down Expand Up @@ -1180,6 +1182,90 @@ def parser():

return self.read_helper("Sano", parser)

def read_plink(self, file, compression):
    """Read and parse a PLINK text file.

    Parameters
    ----------
    file : str
        path to file
    compression
        passed unchanged as the ``compression`` argument of ``pandas.read_csv``

    Returns
    -------
    dict
        result of `read_helper`
    """

    def parser():
        autosomes = range(1, 23)
        # Chromosome labels may come back from the CSV reader as strings or
        # as integers, so both spellings of 1-22 map to the string form.
        chrom_labels = {str(n): str(n) for n in autosomes}
        chrom_labels.update({n: str(n) for n in autosomes})
        # NOTE(review): "XY" is folded into "Y" here -- confirm this is the
        # intended target chromosome.
        chrom_labels.update({"X": "X", "Y": "Y", "XY": "Y", "MT": "MT"})

        table = pd.read_csv(
            file,
            comment="#",
            sep="\t",
            na_values=["--", "-"],
            names=["rsid", "chrom", "pos", "genotype"],
            compression=compression,
        )

        # Any label that is not a key of the map (e.g. an uncommented column
        # header row) becomes NaN and its row is discarded by dropna below.
        table["chrom"] = table["chrom"].map(chrom_labels)

        table = (
            table.dropna(subset=["rsid", "chrom", "pos"])
            .astype(dtype=NORMALIZED_DTYPES)
            .set_index("rsid")
        )
        return (table,)

    return self.read_helper("PLINK", parser)

def read_snps_csv(self, file, comments, compression):
"""Read and parse CSV file generated by ``snps``.
Expand Down
10 changes: 10 additions & 0 deletions tests/input/plink.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Below is a text version of your data.
rsid chromosome position genotype
rs1 1 101 AA
rs2 1 102 CC
rs3 1 103 GG
rs4 1 104 TT
rs5 1 105 --
rs6 1 106 GC
rs7 1 107 TC
rs8 1 108 AT
3 changes: 3 additions & 0 deletions tests/io/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,9 @@ def test_read_circledna(self):
df.drop("rs5", inplace=True) # only called genotypes
self.run_parsing_tests("tests/input/circledna.txt", "CircleDNA", snps_df=df)

def test_read_plink(self):
    """Run the shared parsing checks against the PLINK fixture file."""
    fixture_path = "tests/input/plink.txt"
    # presumably the source name the reader is expected to report -- verify
    # against run_parsing_tests
    source_name = "PLINK"
    self.run_parsing_tests(fixture_path, source_name)

def test_read_ftdna(self):
# https://www.familytreedna.com
self.run_parsing_tests("tests/input/ftdna.csv", "FTDNA")
Expand Down

0 comments on commit 7df10b6

Please sign in to comment.