diff --git a/README.rst b/README.rst
index 7fb8bec..024359b 100644
--- a/README.rst
+++ b/README.rst
@@ -58,6 +58,7 @@ genotype files from the following DNA testing sources:
 - `LivingDNA `_
 - `Mapmygenome `_
 - `MyHeritage `_
+- `PLINK `_
 - `Sano Genetics `_
 - `tellmeGen `_
 
diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py
index 692fbce..f21895f 100644
--- a/src/snps/io/reader.py
+++ b/src/snps/io/reader.py
@@ -173,6 +173,8 @@ def read(self):
             d = self.read_gsa(file, compression, comments)
         elif "Circle" in first_line:
             d = self.read_circledna(file, compression)
+        elif "# Below is a text version of your data." in comments:
+            d = self.read_plink(file, compression)
 
         # detect build from comments if build was not already detected from `read` method
         if not d["build"]:
@@ -1180,6 +1182,51 @@ def parser():
 
         return self.read_helper("Sano", parser)
 
+    def read_plink(self, file, compression):
+        """Read and parse a PLINK text file.
+
+        Parameters
+        ----------
+        file : str
+            path to file
+        compression : str or None
+            compression setting passed through to ``pandas.read_csv``
+
+        Returns
+        -------
+        dict
+            result of `read_helper`
+        """
+
+        def parser():
+            columnnames = ["rsid", "chrom", "pos", "genotype"]
+            df = pd.read_csv(
+                file,
+                comment="#",
+                sep="\t",
+                na_values=["--", "-"],
+                names=columnnames,
+                compression=compression,
+            )
+            # accept chromosomes parsed as either strings or integers and
+            # normalize them to strings; any value missing from the mapping
+            # becomes NaN
+            autosomes = [str(number) for number in range(1, 23)]
+            chrom_map = {name: name for name in autosomes}
+            chrom_map.update({int(name): name for name in autosomes})
+            # NOTE(review): "XY" is mapped to "Y" here; confirm this is intended
+            chrom_map.update({"X": "X", "Y": "Y", "XY": "Y", "MT": "MT"})
+            df["chrom"] = df["chrom"].map(chrom_map)
+            # rows without an rsid, a recognized chromosome, or a position are
+            # discarded; this also removes a column header row when one is
+            # present, because its chromosome value is not in the mapping
+            df = df.dropna(subset=["rsid", "chrom", "pos"])
+            df = df.astype(dtype=NORMALIZED_DTYPES)
+            df = df.set_index("rsid")
+            return (df,)
+
+        return self.read_helper("PLINK", parser)
+
     def read_snps_csv(self, file, comments, compression):
         """Read and parse CSV file generated by ``snps``.
 
diff --git a/tests/input/plink.txt b/tests/input/plink.txt
new file mode 100644
index 0000000..a8a797d
--- /dev/null
+++ b/tests/input/plink.txt
@@ -0,0 +1,10 @@
+# Below is a text version of your data.
+rsid	chromosome	position	genotype
+rs1	1	101	AA
+rs2	1	102	CC
+rs3	1	103	GG
+rs4	1	104	TT
+rs5	1	105	--
+rs6	1	106	GC
+rs7	1	107	TC
+rs8	1	108	AT
diff --git a/tests/io/test_reader.py b/tests/io/test_reader.py
index a61ee4a..0dc6bee 100644
--- a/tests/io/test_reader.py
+++ b/tests/io/test_reader.py
@@ -154,6 +154,9 @@ def test_read_circledna(self):
         df.drop("rs5", inplace=True)  # only called genotypes
         self.run_parsing_tests("tests/input/circledna.txt", "CircleDNA", snps_df=df)
 
+    def test_read_plink(self):
+        self.run_parsing_tests("tests/input/plink.txt", "PLINK")
+
     def test_read_ftdna(self):
         # https://www.familytreedna.com
         self.run_parsing_tests("tests/input/ftdna.csv", "FTDNA")