-
Notifications
You must be signed in to change notification settings - Fork 69
/
homoglyphs.py
265 lines (226 loc) · 8.18 KB
/
homoglyphs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
"""Updated version of core.py from
https://github.com/yamatt/homoglyphs/tree/main/homoglyphs_fork
for modern python3
"""
from collections import defaultdict
import json
from itertools import product
import os
import unicodedata
# What to do with a character that is not part of the active alphabet.
STRATEGY_LOAD = 1  # extend the alphabet with the char's languages/category
STRATEGY_IGNORE = 2  # keep the char in the result as-is
STRATEGY_REMOVE = 3  # drop the char from the result

# Code points 0..127 count as ASCII.
ASCII_RANGE = range(0, 128)

# The JSON data files live in "homoglyph_data", next to this module.
CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0]
DATA_LOCATION = os.path.join(CURRENT_DIR, "homoglyph_data")
class Categories:
    """
    Look up Unicode script categories, named by their ISO 15924 aliases.
    https://en.wikipedia.org/wiki/ISO_15924#List_of_codes

    All data is read from the JSON file at ``fpath``; it is expected to hold
    an "aliases" collection and a "points" list of
    ``[start code, end code, alias]`` entries.
    """

    fpath = os.path.join(DATA_LOCATION, "categories.json")

    @classmethod
    def _load(cls):
        """
        Read and parse the categories JSON file.

        :return: parsed JSON content
        """
        with open(cls.fpath, encoding="utf-8") as source:
            return json.load(source)

    @classmethod
    def _get_ranges(cls, categories):
        """
        Lazily yield the code point ranges that belong to ``categories``.

        Nothing is read or validated until the generator is first advanced.

        :param categories: collection of category aliases
        :return: iter: (start code, end code)
        :raises ValueError: if any requested category is not a known alias
        """
        payload = cls._load()

        # Reject unknown aliases before producing any range.
        known_aliases = payload["aliases"]
        for name in categories:
            if name not in known_aliases:
                raise ValueError("Invalid category: {}".format(name))

        yield from (
            entry[:2] for entry in payload["points"] if entry[2] in categories
        )

    @classmethod
    def get_alphabet(cls, categories):
        """
        Collect every character covered by the given categories.

        :param categories: collection of category aliases
        :return: set of chars in alphabet by categories list
        :rtype: set
        :raises ValueError: if any requested category is not a known alias
        """
        chars = set()
        for first, last in cls._get_ranges(categories):
            # The end code is part of the range, hence the +1.
            chars.update(map(chr, range(first, last + 1)))
        return chars

    @classmethod
    def detect(cls, char):
        """
        Find the category a single character belongs to.

        The first word of the character's Unicode name is tried first; if it
        is not a known alias, the code point ranges from the JSON file are
        searched instead.

        :param char: a single character
        :return: category, or None when no range contains the character
        :rtype: str
        """
        payload = cls._load()

        # First attempt: first word of the Unicode name.
        try:
            script = unicodedata.name(char).split()[0]
        except (TypeError, ValueError):
            # unicodedata.name raises ValueError for characters without a
            # name and TypeError for arguments that are not one character.
            script = None
        if script is not None and script in payload["aliases"]:
            return script

        # Second attempt: first range in the JSON file containing the code.
        codepoint = ord(char)
        return next(
            (
                entry[2]
                for entry in payload["points"]
                if entry[0] <= codepoint <= entry[1]
            ),
            None,
        )

    @classmethod
    def get_all(cls):
        """
        :return: every category alias listed in the JSON file
        :rtype: set
        """
        return set(cls._load()["aliases"])
class Languages:
    """
    Look up alphabets by language code.

    All data is read from the JSON file at ``fpath``, which maps each
    language code to the characters of that language's alphabet.
    """

    fpath = os.path.join(DATA_LOCATION, "languages.json")

    @classmethod
    def _load(cls):
        """
        Read and parse the languages JSON file.

        :return: mapping of language code to alphabet
        """
        with open(cls.fpath, encoding="utf-8") as source:
            return json.load(source)

    @classmethod
    def get_alphabet(cls, languages):
        """
        Merge the alphabets of the given languages.

        :param languages: iterable of language codes
        :return: set of chars in alphabet by languages list
        :rtype: set
        :raises ValueError: if a language code is not present in the file
        """
        alphabets = cls._load()
        merged = set()
        for code in languages:
            if code not in alphabets:
                raise ValueError("Invalid language code: {}".format(code))
            merged.update(alphabets[code])
        return merged

    @classmethod
    def detect(cls, char):
        """
        :param char: character to look for
        :return: set of languages which alphabet contains passed char.
        :rtype: set
        """
        alphabets = cls._load()
        return {code for code, letters in alphabets.items() if char in letters}

    @classmethod
    def get_all(cls):
        """
        :return: every language code present in the JSON file
        :rtype: set
        """
        return set(cls._load())
class Homoglyphs:
    """
    Find look-alike (homoglyph) variants of characters and strings.

    An instance works inside an alphabet assembled from script categories,
    language codes and/or explicitly given characters. ``self.table`` maps
    each character of that alphabet to the homoglyphs that are also part of
    the alphabet, based on the bundled confusables JSON file.
    """

    def __init__(
        self,
        categories=None,
        languages=None,
        alphabet=None,
        strategy=STRATEGY_IGNORE,
        ascii_strategy=STRATEGY_IGNORE,
        ascii_range=ASCII_RANGE,
    ):
        """
        :param categories: iterable of category aliases; ("LATIN", "COMMON")
            is used when categories, languages and alphabet are all empty
        :param languages: iterable of language codes
        :param alphabet: iterable of additional characters
        :param strategy: STRATEGY_LOAD, STRATEGY_IGNORE or STRATEGY_REMOVE;
            applied to characters that are not in the alphabet
        :param ascii_strategy: with STRATEGY_IGNORE, ASCII conversion yields
            nothing as soon as one character has no variant inside
            ``ascii_range``; with any other value such characters are
            dropped (the value itself is not validated)
        :param ascii_range: container of code points accepted as ASCII
        :raises ValueError: if ``strategy`` is not one of the three strategies
        """
        # strategies
        if strategy not in (STRATEGY_LOAD, STRATEGY_IGNORE, STRATEGY_REMOVE):
            raise ValueError("Invalid strategy")
        self.strategy = strategy
        self.ascii_strategy = ascii_strategy
        self.ascii_range = ascii_range

        # Homoglyphs must be initialized by any alphabet for correct work
        if not categories and not languages and not alphabet:
            categories = ("LATIN", "COMMON")

        # cats and langs
        self.categories = set(categories or [])
        self.languages = set(languages or [])

        # alphabet: explicit characters plus everything the categories and
        # languages contribute
        self.alphabet = set(alphabet or [])
        if self.categories:
            self.alphabet.update(Categories.get_alphabet(self.categories))
        if self.languages:
            self.alphabet.update(Languages.get_alphabet(self.languages))
        self.table = self.get_table(self.alphabet)

    @staticmethod
    def _load_confusables():
        """
        Read the confusables JSON file from the data directory.

        :return: mapping of character to its homoglyphs
        """
        path = os.path.join(DATA_LOCATION, "confusables_sept2022.json")
        # Explicit UTF-8, like every other data file in this module; the
        # locale default encoding is not reliable across platforms.
        with open(path, encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def get_table(alphabet):
        """
        Build the homoglyph table for a single alphabet.

        :param alphabet: collection of characters
        :return: defaultdict(set) mapping each char of ``alphabet`` that has
            homoglyphs inside ``alphabet`` to those homoglyphs
        """
        return Homoglyphs.get_restricted_table(alphabet, alphabet)

    @staticmethod
    def get_restricted_table(source_alphabet, target_alphabet):
        """
        Build a homoglyph table from one alphabet into another.

        :param source_alphabet: characters to look up
        :param target_alphabet: characters allowed as homoglyphs
        :return: defaultdict(set) mapping each char of ``source_alphabet``
            to its homoglyphs that are part of ``target_alphabet``; chars
            without such homoglyphs get no entry
        """
        table = defaultdict(set)
        data = Homoglyphs._load_confusables()
        for char in source_alphabet:
            if char in data:
                for homoglyph in data[char]:
                    if homoglyph in target_alphabet:
                        table[char].add(homoglyph)
        return table

    @staticmethod
    def uniq_and_sort(data):
        """
        Deduplicate strings and sort them longest first, then by value.

        :param data: iterable of strings
        :rtype: list
        """
        return sorted(set(data), key=lambda x: (-len(x), x))

    def _update_alphabet(self, char):
        """
        Extend the alphabet so that it covers ``char`` and rebuild the table.

        Languages containing the char are preferred; the char's category is
        used only when no language contains it.

        :return: False when neither a language nor a category was found
            (nothing is changed in that case), True otherwise
        """
        # try detect languages
        langs = Languages.detect(char)
        if langs:
            self.languages.update(langs)
            self.alphabet.update(Languages.get_alphabet(langs))
        else:
            # try detect categories
            category = Categories.detect(char)
            if category is None:
                return False
            self.categories.add(category)
            self.alphabet.update(Categories.get_alphabet([category]))
        # update table for new alphabet
        self.table = self.get_table(self.alphabet)
        return True

    def _get_char_variants(self, char):
        """
        Return ``char`` together with its homoglyphs.

        The result contains the homoglyphs of ``char`` and the homoglyphs of
        those homoglyphs. A char outside the alphabet is handled according
        to ``self.strategy``.

        :rtype: list
        """
        if char not in self.alphabet:
            if self.strategy == STRATEGY_LOAD:
                if not self._update_alphabet(char):
                    return []
            elif self.strategy == STRATEGY_IGNORE:
                return [char]
            elif self.strategy == STRATEGY_REMOVE:
                return []

        # find alternative chars for current char
        direct = self.table.get(char, ())
        # Work on a copy: updating the set stored in self.table would make
        # the table grow with every lookup and results depend on call order.
        variants = set(direct)
        # find alternative chars for alternative chars for current char
        for alt_char in direct:
            variants.update(self.table.get(alt_char, ()))
        # add current char to alternatives
        variants.add(char)
        # uniq, sort and return
        return self.uniq_and_sort(variants)

    def _get_combinations(self, text, ascii=False):
        """
        Lazily yield every string built by picking one variant per char.

        :param text: string to process
        :param ascii: keep only variants whose code point is in
            ``self.ascii_range``
        """
        variations = []
        for char in text:
            alt_chars = self._get_char_variants(char)
            if ascii:
                alt_chars = [alt for alt in alt_chars if ord(alt) in self.ascii_range]
                # A char without any ASCII variant makes the whole text
                # unconvertible under STRATEGY_IGNORE: yield nothing.
                if not alt_chars and self.ascii_strategy == STRATEGY_IGNORE:
                    return
            # Chars left without variants are skipped.
            if alt_chars:
                variations.append(alt_chars)
        if variations:
            for variant in product(*variations):
                yield "".join(variant)

    def get_combinations(self, text):
        """
        :return: all homoglyph combinations of ``text``
        :rtype: list
        """
        return list(self._get_combinations(text))

    def _to_ascii(self, text):
        """Lazily yield the combinations of ``text`` that lie in the ASCII range."""
        for variant in self._get_combinations(text, ascii=True):
            if all(ord(symbol) in self.ascii_range for symbol in variant):
                yield variant

    def to_ascii(self, text):
        """
        :return: unique ASCII variants of ``text``, longest first
        :rtype: list
        """
        return self.uniq_and_sort(self._to_ascii(text))