-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathexample.py
29 lines (24 loc) · 919 Bytes
/
example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import cProfile
from MinHash import MinHash, JaccardMatchFunction
from MinHash import Banding
from KwikCluster import kwik_cluster
def main(file_name='test/synthetic.txt', number_hash_functions=200,
         threshold=0.5, number_processes=4):
    """Cluster the documents in a text file and print the cluster count.

    Each line of ``file_name`` is treated as one document: its id is the
    zero-based line number and its tokens are the space-separated fields
    of the line. The documents are added to a ``MinHash`` instance, its
    signatures are handed to ``Banding``, and the match function built from
    both is passed to ``kwik_cluster`` together with the set of document
    ids.

    Args:
        file_name: Path of the input file, one document per line.
        number_hash_functions: Passed to both ``MinHash`` and ``Banding``.
        threshold: Passed to ``Banding``; presumably the similarity
            cut-off used when matching documents -- confirm against the
            MinHash module.
        number_processes: Forwarded to ``Banding`` as ``number_processes``.

    The defaults are the values that used to be hard-coded, so a bare
    ``main()`` call runs with the same settings as before.
    """
    minhash = MinHash(number_hash_functions)
    doc_ids = set()
    with open(file_name) as ins:
        for doc_id, line in enumerate(ins):
            doc_ids.add(doc_id)
            # Strip the line terminator before splitting; otherwise the
            # last token of every line keeps a trailing newline and can
            # never equal the same word appearing elsewhere.
            tokens = line.rstrip('\r\n').split(' ')
            minhash.add_document(doc_id, tokens)
    minhash.finish()

    bands = Banding(number_hash_functions, threshold,
                    number_processes=number_processes)
    bands.add_signatures(minhash.signatures)

    match_function = JaccardMatchFunction(minhash, bands).match_function
    clusters = kwik_cluster(match_function, doc_ids)

    # One parenthesised argument prints the same text under the Python 2
    # print statement and the Python 3 print function. Joining with a
    # space reproduces the separators the original comma-separated print
    # statement emitted.
    print(' '.join(['Finished with ', str(len(clusters)), ' clusters']))
# Script entry point: when this file is executed directly (not imported),
# run main() under cProfile so the profiler report is printed when it ends.
if __name__ == '__main__':
    cProfile.run(statement='main()')