-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path过滤存在拼音及单词的域名.py
82 lines (60 loc) · 2.66 KB
/
过滤存在拼音及单词的域名.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#coding=utf-8
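'''
Detect, via regular expressions, whether a domain name contains readable strings.
Done: readable-string detection.
TODO: pick out the malicious domains that were misclassified into benigns.txt
--2020.09.17
Words and pinyin are checked separately: a single word occurrence is enough,
while pinyin must occur two or more times for a domain to be judged benign.
TODO: use KMeans to cluster the domains into their families
--2020.09.18
'''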
import re
"""
Params:
words:
"""
# Module-level result lists; main() fills them so they stay inspectable
# after a run.
domains = []
benigns = []
malicious = []
word_file = r'C:\Users\Administrator\Desktop\DNS恶意域名检测\2020Datacon.dns恶意域名\dns_2_question\words.txt'
pinyin_file = r'C:\Users\Administrator\Desktop\DNS恶意域名检测\2020Datacon.dns恶意域名\dns_2_question\拼音.txt'
domains_file = r'C:\Users\Administrator\Desktop\DNS恶意域名检测\2020Datacon.dns恶意域名\dns_2_question\malicious_domains_3.txt'
malicious_file = r'C:\Users\Administrator\Desktop\DNS恶意域名检测\2020Datacon.dns恶意域名\dns_2_question\malicious.txt'
benigns_file = r'C:\Users\Administrator\Desktop\DNS恶意域名检测\2020Datacon.dns恶意域名\dns_2_question\benigns.txt'


def read_lines(path):
    """Return the non-blank lines of *path*, stripped of surrounding whitespace."""
    with open(path, 'r') as handle:
        stripped = (line.strip() for line in handle)
        return [line for line in stripped if line]


def build_pattern(terms, min_repeat=1):
    """Compile a regex matching any of *terms*, repeated back to back.

    Args:
        terms: iterable of literal strings (dictionary words or pinyin
            syllables). Empty entries are ignored.
        min_repeat: minimum number of consecutive terms required for a match.

    Returns:
        A compiled pattern. When *terms* holds no usable entry the pattern
        never matches.
    """
    # Escape each entry so it is matched literally, and drop empty entries:
    # an empty alternative matches the empty string and would make every
    # domain look readable.
    escaped = [re.escape(term) for term in terms if term]
    if not escaped:
        # An empty negative lookahead can never succeed.
        return re.compile(r'(?!)')
    body = '(?:' + '|'.join(escaped) + ')'
    if min_repeat > 1:
        body += '{%d,}' % min_repeat
    return re.compile(body)


def split_domains(candidates, word_model, pinyin_model):
    """Split *candidates* into (benign, malicious) lists, reporting each verdict.

    A domain is benign when it contains a dictionary word or a run of pinyin
    syllables accepted by *pinyin_model*; otherwise it is malicious.
    """
    readable = []
    unreadable = []
    for domain in candidates:
        # Search each model at most once and reuse the match for reporting.
        word_hit = word_model.search(domain)
        pinyin_hit = None if word_hit else pinyin_model.search(domain)
        if word_hit or pinyin_hit:
            readable.append(domain)
            print('良性域名', domain)
            if word_hit:
                print('匹配到单词', word_hit)
            else:
                print('匹配到拼音', pinyin_hit)
        else:
            unreadable.append(domain)
            print('恶意域名', domain)
        print('--------------')
    return readable, unreadable


def append_lines(path, lines):
    """Append *lines* to *path*, one per line; do nothing when there are none."""
    pending = list(lines)
    if not pending:
        return
    # Open once for the whole batch instead of once per line.
    with open(path, 'a') as handle:
        handle.writelines(line + '\n' for line in pending)


def main():
    """Load the dictionaries, classify every domain and append the results."""
    word_model = build_pattern(read_lines(word_file))
    print(word_model.pattern)
    # Two or more consecutive pinyin syllables count as readable.
    pinyin_model = build_pattern(read_lines(pinyin_file), min_repeat=2)
    print('pinyin data: ', pinyin_model.pattern)

    domains.extend(read_lines(domains_file))
    readable, unreadable = split_domains(domains, word_model, pinyin_model)
    benigns.extend(readable)
    malicious.extend(unreadable)

    # Only the domains judged malicious belong in the malicious file.
    append_lines(malicious_file, malicious)
    append_lines(benigns_file, benigns)


if __name__ == '__main__':
    main()