forked from BRCAChallenge/literature-search
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmatch.py
105 lines (82 loc) · 3.79 KB
/
match.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
"""
Match variants found in papers to variants in BRCA Exchange
"""
import functools
import re

import hgvs.exceptions
import hgvs.parser
import pandas as pd
@functools.lru_cache(maxsize=None)
def parse_hgvs(parser, candidate):
    """Parse and normalize a candidate HGVS string scraped from a paper.

    Args:
        parser: an hgvs.parser.Parser instance. Hashable, so it participates
            in the lru_cache key along with the candidate string.
        candidate: raw HGVS string, e.g. "NM_007300.3:c.1100C>None".

    Returns:
        The canonical string form of the parsed variant, or "" when the
        candidate is empty or cannot be parsed.
    """
    # Explicit guard instead of `assert` — asserts are stripped under -O,
    # and "" is what the caller already treats as "no parse".
    if not candidate:
        return ""
    # Normalize single deletions: NM_007300.3:c.1100C>None -> NM_007300.3:c.1100delC
    candidate = re.sub(r"(NM.*c\.\d*)([ATCGatcg]+)(>None)", r"\1del\2", candidate)
    # TODO: Normalize multiple deletions and delins
    # TODO: Limit to only specific BRCA transcripts
    # ex: 23199084 NM_007294.3:c.2681AA>None|NM_007300.3:c.2681AA>None
    try:
        return str(parser.parse_hgvs_variant(candidate))
    except hgvs.exceptions.HGVSParseError:
        print("Failed to parse: {}".format(candidate))
        return ""
def next_mention(row, parser):
    """Yield BRCA Exchange variant matches for one literature mention row.

    Matching strategies, strongest first:
      1. parsed hgvsCoding against pyhgvs_cDNA (10 points)
      2. parsed hgvsCoding against Synonyms (7 points)
      3. raw mention text against Synonyms, only if nothing else matched
         (len(text) - 5 points, so longer matches score higher)

    Args:
        row: a mentions row with docId, hgvsCoding, texts and mutSnippets
            fields (pipe-separated strings).
        parser: hgvs.parser.Parser, forwarded to parse_hgvs.

    Yields:
        (pyhgvs_Genomic_Coordinate_38, docId, mutSnippets, points) tuples.

    Note: reads the module-level `variants` DataFrame.
    """
    matched = False
    parsed_c_hgvs = ""
    for raw_hgvs in set([r.strip() for r in row.hgvsCoding.split("|")]):
        if not raw_hgvs:
            continue
        parsed_c_hgvs = parse_hgvs(parser, raw_hgvs)
        if not parsed_c_hgvs:
            continue
        # Try parsed hgvsCoding to pyhgvs_cDNA.
        # regex=False: HGVS strings contain regex metacharacters (".", ">",
        # "(") that would otherwise be interpreted as patterns or raise
        # re.error; matches the sibling Synonyms lookups below.
        # na=False: missing cells count as non-matching instead of NaN,
        # which would break boolean indexing.
        hits = variants[variants.pyhgvs_cDNA.str.contains(
            parsed_c_hgvs, regex=False, na=False)]
        if hits.shape[0] == 1:
            matched = True
            yield (hits.iloc[0].pyhgvs_Genomic_Coordinate_38, row.docId, row.mutSnippets, 10)
        elif hits.shape[0] > 1:
            print("ERROR: Multiple matches on {} {} against ".format(raw_hgvs, parsed_c_hgvs))
            # print(hits)
            print("Continuing...")
        # Try parsed hgvsCoding to synonym (BRCA Exchange synonyms replace : with .)
        for i, hit in variants.loc[variants.Synonyms.str.contains(
                parsed_c_hgvs.replace(":", "."), regex=False, na=False)].iterrows():
            matched = True
            yield (hit.pyhgvs_Genomic_Coordinate_38, row.docId, row.mutSnippets, 7)
    # Try texts to synonym
    # Note: Could always run but adds 40+ per variant...so only run if nothing else works
    if not matched:
        for text in set([t.strip() for t in row.texts.split("|")]):
            # Skip very short texts, i.e. M4N to reduce false positive
            if len(text) < 6:
                continue
            for i, hit in variants[
                    variants.Synonyms.str.contains(text, regex=False, na=False)].iterrows():
                matched = True
                # Longer matches score higher...
                yield (hit.pyhgvs_Genomic_Coordinate_38,
                       row.docId, row.mutSnippets, len(text) - 5)
    # if not matched:
    #     print("Failed to match: hgvsCoding={} Mapped={} Texts={}".format(
    #         raw_hgvs, parsed_c_hgvs, row.texts))
if __name__ == "__main__":
    print("Creating HGVS parser...")
    parser = hgvs.parser.Parser()

    print("Loading variants...")
    variants = pd.read_csv(
        "/crawl/built_with_change_types.tsv",
        sep="\t", header=0, encoding="utf-8",
        usecols=["pyhgvs_Genomic_Coordinate_38", "pyhgvs_cDNA", "Synonyms"])
    print("Found {} variants".format(variants.shape[0]))

    print("Loading mentions...")
    mentions = pd.read_csv(
        "/crawl/mutations-trimmed.tsv",
        sep="\t", header=0, encoding="utf-8", dtype="str").fillna("")
    # Keep only rows that have snippets and at least one matchable field.
    has_snippet = mentions.mutSnippets != ""
    has_candidate = (mentions.hgvsCoding != "") | (mentions.texts != "")
    mentions = mentions[has_snippet & has_candidate]
    mentions = mentions.sort_values(["docId", "hgvsCoding"])
    print("Found {} mentions".format(mentions.shape[0]))

    # Collect every (coordinate, pmid, snippets, points) tuple the
    # generator produces for each mention row.
    scored = []
    for _, row in mentions.iterrows():
        scored.extend(next_mention(row, parser))

    matches = pd.DataFrame(
        scored,
        columns=["pyhgvs_Genomic_Coordinate_38", "pmid", "snippets", "points"]
    ).set_index("pyhgvs_Genomic_Coordinate_38", drop=True)
    matches.to_csv("/crawl/mentions-matched.tsv", sep="\t")