-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
227 lines (182 loc) · 7.51 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
import pandas as pd
from pandas.io.json import json_normalize
import re
from requests_cache import CachedSession
import datetime
import tempfile
from zipfile import ZipFile
import semver
EXPIRE_AFTER_DEFAULT = datetime.timedelta(days=7)
MAX_ROWS_DEFAULT = 10
THRESHOLD_DEFAULT = 0.2
SEARCH_IN_DEFAULT = ['name', 'paragraph', 'sentence', 'category']
def init_session(expire_after=EXPIRE_AFTER_DEFAULT, session=None):
if session is None:
session = CachedSession(cache_name='cache/cache', backend='sqlite',
expire_after=expire_after)
return session
def init_index(df=None, session=None, ascending=False):
if df is None:
return download_index(session, ascending)
return df
def download_index(session=None, ascending=False):
if session is None:
session = init_session(session)
url = "http://downloads.arduino.cc/libraries/library_index.json"
r = session.get(url)
d = r.json()
libraries = d['libraries']
df = json_normalize(libraries)
df['version'] = df['version'].map(semver.parse_version_info)
# raises TypeError: unhashable type: 'VersionInfo'
# see https://github.com/k-bx/python-semver/issues/73
df = df.sort_values(by=['name', 'version'], ascending=[True, ascending])
return df
def repository_url_from_archive_url(url):
pattern = re.compile(r"http:\/\/downloads\.arduino\.cc\/libraries\/(.*)/(.*)\/(.*)-(.*).zip")
match = pattern.match(url)
allowed_websites = ['github.com']
if match is None:
raise NotImplementedError("Can't match")
website, username, repository, version = match.groups()
if website == 'github.com':
repository_url = "https://github.com/%s/%s" % (username, repository)
else:
raise NotImplementedError("Website is %r but only website in %r are currently supported" % (website, allowed_websites))
return repository_url
def add_possible_repository_url(df):
def _possible_repository_url(url):
try:
repository_url = repository_url_from_archive_url(url)
# repository_url = repository_url.lower()
return repository_url
except Exception as e:
return ""
df['possible_url_repository'] = df['url'].map(_possible_repository_url)
df['have_repo_like_archive'] = df['website'].str.lower() == df['possible_url_repository'].str.lower()
return df
def dice_coefficient(a, b):
"""
From https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient
"""
if not len(a) or not len(b):
return 0.0
if a == b:
return 1.0
if len(a) == 1 or len(b) == 1:
return 0.0
a_bigram_list = [a[i:i + 2] for i in range(len(a) - 1)]
b_bigram_list = [b[i:i + 2] for i in range(len(b) - 1)]
a_bigram_list.sort()
b_bigram_list.sort()
# assignments to save function calls
lena = len(a_bigram_list)
lenb = len(b_bigram_list)
# initialize match counters
matches = 0
i = 0
j = 0
while (i < lena and j < lenb):
if a_bigram_list[i] == b_bigram_list[j]:
matches += 2
i += 1
j += 1
elif a_bigram_list[i] < b_bigram_list[j]:
i += 1
else:
j += 1
score = float(matches) / float(lena + lenb)
return score
def find_nearest_names(name, threshold=THRESHOLD_DEFAULT, df=None):
df = init_index(df)
library_names = df.name.unique()
df_nearest = pd.DataFrame()
df_nearest['name'] = library_names
name_upper = name.upper()
df_nearest['score'] = df_nearest['name'].str.upper().map(lambda n: dice_coefficient(n, name_upper))
df_nearest = df_nearest[df_nearest['score'] >= threshold]
df_nearest = df_nearest.sort_values(by='score', ascending=False)
df_nearest.index = range(1, len(df_nearest) + 1)
return df_nearest
def find_nearest_name(name, threshold=THRESHOLD_DEFAULT, df=None):
df_nearest = find_nearest_names(name, threshold, df)
if len(df_nearest) == 0:
raise NotImplementedError("Can't find a nearest name with such a threshold (%f) try reducing or with a better name than %s" % (threshold, name))
return df_nearest['name'].iloc[0]
def get_versions(name, df=None, ascending=False):
df = init_index(df)
df_library = df[df['name'] == name]
df_library = df_library.sort_values(by='version', ascending=True)
df_library.index = range(1, len(df_library) + 1)
df_library = df_library.sort_values(by='version', ascending=ascending)
return df_library['version']
def get_latest_version(name, df=None):
return get_versions(name, ascending=True, df=df).iloc[-1]
def init_version(name, version=None, df=None):
if version is None:
return get_latest_version(name, df)
else:
return version
def get_archive_url(name, version, df):
df_library = df[df['name'] == name]
df_library_version = df_library[df_library['version'] == version]
n = len(df_library_version)
if n == 0:
raise Exception("No library matching name '{name}' have been found try `find_nearest_name('{name}') or `find_nearest_names('{name}')`".format(name=name))
elif n > 1:
raise Exception("Only one library with a given version should be found - several have been found")
else: # n==1
return df_library_version.url.iloc[0]
def show_licence_from_archive_url(url, session=None):
session = init_session(session)
r = session.get(url)
raw = r.content
with tempfile.TemporaryFile() as tmpf:
tmpf.write(raw)
with ZipFile(tmpf, 'r') as zf:
license_filenames = []
for filename in zf.namelist():
filename_upper = filename.upper()
if 'LICENSE' in filename_upper or 'LICENCE' in filename_upper:
license_filenames.append(filename)
print()
print(filename)
print()
data = zf.open(filename).read().decode()
print(data)
n_licenses = len(license_filenames)
if n_licenses == 0:
print("No license have been found")
elif n_licenses > 1:
print("Several license files have been found")
return n_licenses
def show_licence(name, version=None, session=None):
df = download_index(session)
version = init_version(name, version=version, df=df)
url = get_archive_url(name, version, df)
show_licence_from_archive_url(url, session)
def show_versions(name, session=None):
df = download_index(session)
versions = get_versions(name, df=None, ascending=False)
if len(versions) == 0:
print(find_nearest_names(name, df=df))
raise Exception("No library matching name '{name}' have been found try `find_nearest_name('{name}')` or `find_nearest_names('{name}')`".format(name=name))
return versions
def search(keywords, columns_search=None, session=None, df_last=None):
if df_last is None:
df = download_index(session)
df_last = df.groupby('name').first().reset_index()
if columns_search is None:
columns_search = SEARCH_IN_DEFAULT
keywords = keywords.split(" ")
ser_have_keyword = pd.Series(False, index=df_last.index)
for keyword in keywords:
for col in columns_search:
ser_have_keyword = ser_have_keyword | df_last[col].str.contains(keyword, case=False).fillna(False)
# ser_have_keyword = ser_have_keyword & ...
df_search = df_last[ser_have_keyword]
n_found = len(df_search)
print(df_search.set_index('name'))
print()
print("Found %d libraries" % n_found)
return df_search