prep_elastic.py
import argparse
import glob
import time
import csv
from tqdm import tqdm
from beir.retrieval.search.lexical.elastic_search import ElasticSearch


def build_elasticsearch(
    beir_corpus_file_pattern: str,
    index_name: str,
):
    beir_corpus_files = glob.glob(beir_corpus_file_pattern)
    print(f'#files {len(beir_corpus_files)}')

    # Elasticsearch connection and index settings; documents are stored
    # with a 'title' field and a 'txt' body field.
    config = {
        'hostname': 'localhost',
        'index_name': index_name,
        'keys': {'title': 'title', 'body': 'txt'},
        'timeout': 100,
        'retry_on_timeout': True,
        'maxsize': 24,
        'number_of_shards': 'default',
        'language': 'english',
    }
    es = ElasticSearch(config)

    # Recreate the index from scratch: drop any existing index with this
    # name, wait briefly for the deletion to take effect, then create it.
    print(f'create index {index_name}')
    es.delete_index()
    time.sleep(5)
    es.create_index()

    def generate_actions():
        # Stream bulk-index actions from the TSV corpus file(s);
        # each row is expected to be (id, text, title).
        for beir_corpus_file in beir_corpus_files:
            with open(beir_corpus_file, 'r') as fin:
                reader = csv.reader(fin, delimiter='\t')
                next(reader)  # skip header
                for row in reader:
                    _id, text, title = row[0], row[1], row[2]
                    es_doc = {
                        '_id': _id,
                        '_op_type': 'index',
                        'refresh': 'wait_for',
                        config['keys']['title']: title,
                        config['keys']['body']: text,
                    }
                    yield es_doc

    # Bulk-index all documents with a progress bar.
    progress = tqdm(unit='docs')
    es.bulk_add_to_index(
        generate_actions=generate_actions(),
        progress=progress)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', type=str, default=None,
                        help='glob pattern for BEIR-format corpus TSV file(s)')
    parser.add_argument('--index_name', type=str, default=None,
                        help='name of the Elasticsearch index to (re)create')
    args = parser.parse_args()
    build_elasticsearch(args.data_path, index_name=args.index_name)
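
Below is a minimal sketch of how the script can be driven, assuming an Elasticsearch server is already reachable on localhost and the corpus follows the three-column TSV layout (id, text, title) that generate_actions expects. The glob pattern and index name are illustrative placeholders, not files shipped with the repo.

# Sketch of a typical invocation (hypothetical paths and index name):
#
#   python prep_elastic.py --data_path 'corpus/*.tsv' --index_name wiki
#
# Equivalent programmatic use:
from prep_elastic import build_elasticsearch

build_elasticsearch(
    beir_corpus_file_pattern='corpus/*.tsv',  # hypothetical glob; TSV rows: id, text, title
    index_name='wiki',                        # hypothetical index name
)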