-
-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathbench.py
149 lines (121 loc) · 5.46 KB
/
bench.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""
Benchmark comparison between acora search and re.findall()
"""
# Implementations exercised by compare_search():
#   "pa" - automaton built with AcoraBuilder.build(acora=PyAcora)
#   "ca" - automaton built with AcoraBuilder.build() (builder's default engine)
#   "re" - compiled regular expression searched with findall()
COMPARED_IMPLEMENTATIONS = ["pa", "ca", "re"]
# Passed as ``number=`` to timeit.Timer.repeat(): how many calls make up one
# timing measurement (not the number of measurements taken).
REPEAT_COUNT = 5
import re
import sys
import timeit
from time import time
from itertools import combinations
from functools import partial
from acora import AcoraBuilder, BytesAcora, UnicodeAcora, PyAcora
def prepare_benchmark_data():
    """Build the haystack text and the keyword pool used by the benchmark.

    The haystack is a fixed mixed-case seed string concatenated in its
    lower-case, original and upper-case forms, repeated 1000 times.

    Returns:
        A ``(search_string, all_keywords)`` tuple.  Under Python 2 both the
        text and every keyword are converted to ``unicode``.
    """
    seed = ('bdfdaskdjfhaslkdhfsadhfklashdflabcasdabcdJAKHDBVDFLNFCBLSADHFCALKSJ'
            'jklhcnajskbhfasjhancfksjdfhbvaliuradefhzcbdegnashdgfbcjaabesdhgkfcnash'
            'fdkhbdegxcbgjsvdhabcabcfcgbnxahsdbgfbcakjsdhgnfcxsababcmdabe')
    # One lower-case, one untouched and one upper-case copy, so that both
    # case-sensitive and case-insensitive searches have something to find.
    mixed_case_unit = seed.lower() + seed + seed.upper()
    haystack = mixed_case_unit * 1000

    keyword_pool = [
        'ab', 'abc', 'abcd', 'abcabc', 'ababc', 'ABBBC', 'ABCABC',
        'bdfd', 'ade', 'abe', 'bdeg', 'fklash',
        'gnfcxsababcmdabe', 'SADHFCAL',
        'notthere', 'not-to-be-found', 'not-to-be-found-either',
    ]

    on_python2 = sys.version_info[0] < 3
    if on_python2:
        # Plain string literals are byte strings on Py2; promote to unicode.
        keyword_pool = [unicode(kw) for kw in keyword_pool]
        haystack = unicode(haystack)

    return haystack, keyword_pool
def _print_best_time(label, search, target):
    """Print the fastest timeit measurement of ``search(target)`` under *label*."""
    timings = timeit.Timer(partial(search, target)).repeat(number=REPEAT_COUNT)
    print("TIME(%s): %.3f" % (label, min(timings)))


def compare_search(s, filename, ignore_case, *keywords):
    """Set up, time and run a keyword search with every enabled implementation.

    Which implementations run is controlled by the module-level
    ``COMPARED_IMPLEMENTATIONS`` list ("pa", "ca", "re").  Setup times and
    the best search time of each enabled implementation are printed.

    Args:
        s: The text to search (a unicode or byte string).
        filename: Path handed to the automatons' ``filefindall()``; a falsy
            value skips the file-based searches.
        ignore_case: Whether to search case-insensitively.
        *keywords: The keywords to search for; must not be empty when the
            "re" implementation is enabled.

    Returns:
        A 5-tuple ``(pa_string, ca_string, pa_file, ca_file, re_string)``
        of search results.  An entry is ``None`` when that search was not
        run *or* when it produced an empty result.
    """
    run_pa = 'pa' in COMPARED_IMPLEMENTATIONS
    run_ca = 'ca' in COMPARED_IMPLEMENTATIONS
    run_re = 're' in COMPARED_IMPLEMENTATIONS

    # Give every name that is read after the setup phase a default, so that
    # disabling an implementation cannot lead to a NameError further down.
    setup_pya = setup_ca = setup_re = 0
    builder = py_acora = c_acora = regexp = None

    if run_pa:
        t = time()
        builder = AcoraBuilder(keywords, ignore_case=ignore_case)
        py_acora = builder.build(acora=PyAcora)
        setup_pya = time() - t

    if run_ca:
        t = time()
        builder = AcoraBuilder(keywords, ignore_case=ignore_case)
        c_acora = builder.build()
        setup_ca = time() - t

    if run_re:
        t = time()
        if hasattr(keywords[0], 'encode'):  # unicode in Py3?
            separator = '|'
        else:
            separator = '|'.encode('ASCII')
        # Keywords are literal strings, not patterns: escape them so that
        # regex metacharacters in a keyword cannot change what is matched.
        kw_regexp = separator.join(map(re.escape, keywords))
        regexp = re.compile(kw_regexp, re.I if ignore_case else 0)
        setup_re = time() - t

    if builder is not None:
        for_unicode = builder.for_unicode
    else:
        # No automaton was built; classify by the type of the searched text.
        for_unicode = not isinstance(s, bytes)

    print("Case %ssensitive %s\n- setup times: PA: %.4f, CA: %.4f, RE: %.4f" % (
        'in' if ignore_case else '',
        'unicode' if for_unicode else 'bytes',
        setup_pya, setup_ca, setup_re))

    if run_pa:
        _print_best_time('paS', py_acora.findall, s)
    if run_ca:
        _print_best_time('caS', c_acora.findall, s)
    if filename:
        if run_pa:
            _print_best_time('paF', py_acora.filefindall, filename)
        if run_ca:
            _print_best_time('caF', c_acora.filefindall, filename)
    if run_re:
        _print_best_time('reS', regexp.findall, s)

    # ``... or None`` deliberately maps an empty result to None, matching
    # the entry reported for a search that did not run at all.
    return (
        (py_acora.findall(s) or None) if run_pa else None,
        (c_acora.findall(s) or None) if run_ca else None,
        (py_acora.filefindall(filename) or None) if run_pa and filename else None,
        (c_acora.filefindall(filename) or None) if run_ca and filename else None,
        (regexp.findall(s) or None) if run_re else None,
    )
def run_benchmark(search_string, all_keywords):
    """Benchmark and cross-check every non-empty combination of keywords.

    Combinations are processed from the largest (all keywords) down to
    single keywords.  For each one, compare_search() is run on the unicode
    text (case-sensitive and case-insensitive) and on the ASCII-encoded
    byte text (case-sensitive; case-insensitive only on Python 2), and the
    acora result lengths are verified with assert_equal().

    Args:
        search_string: The text to search; must be ASCII-encodable.
        all_keywords: The pool of keywords to draw combinations from.

    Raises:
        AssertionError: If an acora result count differs from the expected
            number of occurrences.
    """
    import tempfile

    search_string_lower = search_string.lower()
    bytes_search_string = search_string.encode('ASCII')
    bytes_search_string_lower = search_string_lower.encode('ASCII')

    # The context manager closes (and thereby removes) the temporary file
    # even when one of the assertions below fails.
    with tempfile.NamedTemporaryFile() as temp_text_file:
        temp_text_file.write(bytes_search_string)
        # Flush so the data is on disk before it is re-read by file name.
        temp_text_file.flush()
        filename = temp_text_file.name

        for i in range(len(all_keywords), 0, -1):
            for keywords in combinations(all_keywords, i):
                print('##Keywords(%d): %s' % (len(keywords), ' '.join(sorted(keywords))))
                keywords_lower = [kw.lower() for kw in keywords]

                # Unicode text, case-sensitive: only the two in-memory
                # acora results exist (no file is searched).
                results = compare_search(search_string, None, False, *keywords)
                for result in results[:2]:
                    assert_equal(results, result, search_string, keywords)

                # Unicode text, case-insensitive: counts are checked against
                # the lower-cased text and keywords.
                results = compare_search(search_string, None, True, *keywords)
                for result in results[:2]:
                    assert_equal(results, result, search_string_lower, keywords_lower)

                # Byte text, case-sensitive: in-memory and file-based acora
                # results (the first four entries) are checked.
                bytes_keywords = [keyword.encode('ASCII') for keyword in keywords]
                results = compare_search(bytes_search_string, filename, False, *bytes_keywords)
                for result in results[:4]:
                    assert_equal(results, result, bytes_search_string, bytes_keywords)

                if sys.version_info[0] < 3:
                    # case-insensitive search in byte strings is not supported in Py3
                    bytes_keywords_lower = [keyword.encode('ASCII') for keyword in keywords_lower]
                    results = compare_search(bytes_search_string, filename, True, *bytes_keywords)
                    for result in results[:4]:
                        assert_equal(results, result, bytes_search_string_lower, bytes_keywords_lower)
def assert_equal(results, result, search_string, keywords):
    """Check that *result* has one entry per keyword occurrence.

    The expected length is the sum of ``search_string.count(keyword)`` over
    all *keywords*.  A *result* of ``None`` is not checked.

    Args:
        results: All results of the comparison run; only used to list the
            individual result lengths in the failure message.
        result: The single result to verify, or ``None`` to skip the check.
        search_string: The text that was searched.
        keywords: The keywords that were searched for.

    Raises:
        AssertionError: If the length of *result* differs from the
            expected count.
    """
    if result is not None:
        found = len(result)
        expected = sum(search_string.count(keyword) for keyword in keywords)
        assert found == expected, "EXPECTED: %d, got %s, %s" % (
            expected,
            found,
            [None if res is None else len(res) for res in results])
if __name__ == '__main__':
    # Script entry point: build the benchmark input, then run every comparison.
    benchmark_text, benchmark_keywords = prepare_benchmark_data()
    run_benchmark(benchmark_text, benchmark_keywords)