-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetBiosamples.py
53 lines (50 loc) · 1.99 KB
/
getBiosamples.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from tqdm import tqdm
from elasticsearch import Elasticsearch
import os
# Look up the BioSample identifier for every accession listed in
# SPARC_accessions.txt by querying an Elasticsearch isolate index, then
# report how many BioSamples were found.

# One accession per line; line terminators are dropped by splitlines().
with open("SPARC_accessions.txt", "r") as inFile:
    accessions = inFile.read().splitlines()

# BioSample values collected from the first hit of each successful search.
biosamples = []

# Connection settings come from the environment; os.environ.get() yields
# None for any variable that is not set.
searchURL = os.environ.get("ELASTIC_API_URL")
apiID = os.environ.get("ELASTIC_ISOLATE_API_ID")
apiKEY = os.environ.get("ELASTIC_ISOLATE_API_KEY")
indexName = os.environ.get("ELASTIC_ISOLATE_NAME")

# Maximum number of hits requested per query (only the first hit is used).
numResults = 10

# Document fields the accession string is matched against.
searchFields = [
    "isolateName",
    "isolateNameUnderscore",
    "Assembly_name",
    "Infraspecific_name",
    "GenBank_assembly_accession",
    "RefSeq_assembly_and_GenBank_assemblies_identical",
    "BioSample",
    "read_accession",
    "run_accession",
]

# The client does not depend on the accession, so build it once and reuse
# it for every search instead of constructing a new one per iteration.
client = Elasticsearch([searchURL],
                       api_key=(apiID, apiKEY))

for access in tqdm(accessions):
    # Fuzzy multi-field match of the accession against the isolate index.
    fetchData = {
        "size": numResults,
        "track_total_hits": True,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": access,
                        "fields": searchFields,
                        "operator": "or",
                        "fuzziness": "AUTO",
                    }
                }
            }
        },
    }
    isolateResult = client.search(index=indexName,
                                  body=fetchData,
                                  request_timeout=60)

    hits = isolateResult["hits"]["hits"]
    if not hits:
        # Nothing matched this accession; move on to the next one.
        continue

    # Only the first returned hit is considered.
    firstSource = hits[0]["_source"]
    try:
        biosamples.append(firstSource["BioSample"])
    except KeyError:
        # The matched document has no BioSample field: show which fields it
        # does have so the gap can be investigated.
        print(firstSource.keys())

print(len(biosamples))