-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgene_type_retrieval.py
92 lines (78 loc) · 2.83 KB
/
gene_type_retrieval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""
The purpose of this Python script is to retrieve the gene type of the
genes for which the mapping to either Swiss-Prot or TrEMBL was
successful. To this end, the NCBI database is queried.
"""
import time
import pandas as pd
from biotite.database import entrez
# Text that marks the line carrying the gene type within the XML
# representation of an NCBI Gene entry
GENE_TYPE_MARKER = "<Entrezgene_type value="
# Gene types that are written to the "Gene_type" column
# They are tested in exactly this order as substrings of the gene type
# line
COVERED_GENE_TYPES = (
    "pseudo",
    "ncRNA",
    "other",
    "snoRNA",
    "protein-coding",
    "unknown",
)
# Number of download attempts per gene ID before giving up
MAX_QUERY_ATTEMPTS = 3
# Pause in seconds before each gene ID and after each failed attempt
QUERY_PAUSE = 1


def _fetch_gene_type_line(gene_ID):
    """
    Download the NCBI Gene entry of the given gene ID as XML and return
    the first line containing the gene type marker.

    The download is attempted up to `MAX_QUERY_ATTEMPTS` times, pausing
    `QUERY_PAUSE` seconds after each failed attempt. An attempt counts
    as failed if the download raises an exception or if the downloaded
    text lacks the gene type marker. `None` is returned if every
    attempt failed.
    """
    for _ in range(MAX_QUERY_ATTEMPTS):
        try:
            NCBI_entry = entrez.fetch_single_file(
                uids=[str(gene_ID)],
                file_name=None,
                db_name="gene",
                ret_type="xml"
            )
            # As the file_name is specified to be None, Biotite's
            # fetch_single_file() function returns a StringIO object
            # Its content can be accessed via the getvalue() method
            # Note that the getvalue() method is preferred to the
            # read() method as the latter moves the cursor to the last
            # index so that repeatedly using the read() method returns
            # an empty string
            NCBI_entry_str = NCBI_entry.getvalue()
        except Exception:
            # The query is best-effort, so any download error merely
            # triggers another attempt
            # Catching Exception rather than using a bare except keeps
            # KeyboardInterrupt and SystemExit working
            NCBI_entry_str = ""
        # It sometimes happens that the queried information is
        # incompletely transmitted
        # In such a case, the marker is missing and the query has to be
        # repeated
        # An explicit check is used instead of an assert statement as
        # the latter is removed when Python runs with the -O option
        for line in NCBI_entry_str.split("\n"):
            if GENE_TYPE_MARKER in line:
                return line
        time.sleep(QUERY_PAUSE)
    return None


def _classify_gene_type(gene_type_line):
    """
    Return the first entry of `COVERED_GENE_TYPES` occurring in the
    given gene type line or `None` if none of them occurs.
    """
    for gene_type in COVERED_GENE_TYPES:
        if gene_type in gene_type_line:
            return gene_type
    return None


gene_type_df = pd.read_csv(
    "gene_type_query.tsv",
    sep="\t"
)

gene_IDs_with_types_not_covered = []

# Iterating over items() yields the actual row labels, which is what the
# `at` accessor expects
# For the default index created by read_csv(), they equal the row
# positions
for row_label, gene_ID in gene_type_df["Gene_ID"].items():
    # Pause between consecutive genes in order to space out the queries
    time.sleep(QUERY_PAUSE)

    gene_type_line = _fetch_gene_type_line(gene_ID)
    if gene_type_line is None:
        print(
            "NCBI database query wasn't successful for gene ID "
            f"{gene_ID}."
        )
        continue

    gene_type = _classify_gene_type(gene_type_line)
    if gene_type is None:
        gene_IDs_with_types_not_covered.append(gene_ID)
    else:
        gene_type_df.at[row_label, "Gene_type"] = gene_type

if gene_IDs_with_types_not_covered:
    print(
        "The following gene IDs have gene types other than the ones "
        "covered:\n",
        gene_IDs_with_types_not_covered,
        sep=""
    )

# Overwrite the original TSV file with the updated DataFrame
gene_type_df.to_csv(
    "gene_type_query.tsv",
    sep="\t",
    index=False
)