-
Notifications
You must be signed in to change notification settings - Fork 33
/
Copy pathyabin.py
326 lines (263 loc) · 10.9 KB
/
yabin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
'''
YaraBin (Yara + Binary)
This generates Yara rules from function prologs, for matching and hunting
Questions or comments? Hit me up @chrisdoman
'''
import binascii
import re
import os
import argparse
import math
import sqlite3
import hashlib
# What percent overlap required for two malware samples considered to be the same family?
# From 0 (0%) to 1 (100%). A large number means a tighter yara signature
# will be created
percent_tight_match = 0.8
# Cache of pattern-sets already emitted as rules, so generateYara() never
# prints the same rule twice when walking a folder
seen_patterns = {}
# Whitelist / malware-sample store, created next to the script on first run
conn = sqlite3.connect('db.db')
db = conn.cursor()
# Trade crash-durability for insert speed; acceptable since db.db can be
# rebuilt from the sample folders at any time
db.execute('PRAGMA synchronous=OFF')
def parseArguments():
    """Parse the command line and dispatch each supplied option to its action."""
    parser = argparse.ArgumentParser(
        description='Yabin - Signatures and searches malware')
    parser.add_argument('-y', '--yara', help='Generate yara rule for the file or folder', required=False)
    parser.add_argument('-yh', '--yaraHunt', help='Generate wide yara rule (any of, not all of).\r\n Useful for hunting for related samples or potentially malicious files that share any of the code - but liable to false positive', required=False)
    parser.add_argument('-d', '--deleteDatabase', help='Empty the whitelist and malware database', action='store_true')
    parser.add_argument('-w', '--addToWhitelist', help='Add a file or folder to the whitelist', required=False)
    parser.add_argument('-f', '--fuzzyHash', help='Generate a fuzzy hash for the file', required=False)
    parser.add_argument('-m', '--malwareAdd', help='Add malware file or folder to malware database to be searched', required=False)
    parser.add_argument('-s', '--malwareSearch', help='Search for samples related to this file', required=False)
    args = vars(parser.parse_args())

    # Option name -> handler; each handler receives the parsed value
    # (deleteDatabase is a store_true flag, so its handler ignores it).
    dispatch = [
        ('yara', lambda value: yara(value)),
        ('yaraHunt', lambda value: yara(value, False)),
        ('deleteDatabase', lambda value: deleteDatabase()),
        ('addToWhitelist', lambda value: addToWhitelist(value)),
        ('fuzzyHash', lambda value: fuzzyHash(value)),
        ('malwareAdd', lambda value: addMalware(value)),
        ('malwareSearch', lambda value: malwareSearch(value)),
    ]
    for key, handler in dispatch:
        if args[key]:
            handler(args[key])
def getBytePatterns(filename, ignore_whitelist=False):
    """Extract the function-prolog byte patterns found in *filename*.

    filename         -- path of the binary to scan
    ignore_whitelist -- when True, include patterns even if whitelisted

    Returns a dict mapping each distinct hex pattern (whitespace/'x'
    separators removed) to its Shannon entropy.
    """
    with open(filename, 'rb') as f:
        content = f.read()
    # hexlify() returns bytes on Python 3; decode so regex matching works
    # on text in both Python 2 and 3.  (Renamed from 'hex', which shadowed
    # the builtin.)
    hex_string = binascii.hexlify(content).decode('ascii')
    # Add - every two characters so we match -xx- not x-x
    hex_string = 'x'.join([hex_string[i:i + 2] for i in range(0, len(hex_string), 2)])
    seen = {}
    # NOTE(review): match[0] assumes the patterns in regex.txt contain their
    # own groups, so findall yields tuples - confirm against regex.txt
    for match in re.findall(prolog_regex, hex_string):
        bit = match[0].replace('x', '')
        if bit not in seen:
            if ignore_whitelist or not whitelisted(bit):
                # Only include high entropy patterns, ie) avoid 0000000 or
                # 1111111 etc.
                # if entropy(bit) > 0:
                seen[bit] = entropy(bit)
    return seen
def loadProlog():
    """Build the alternation regex of function-prolog patterns from regex.txt.

    Lines starting with '#' are comments; lines of 3 characters or fewer
    are ignored as too short to be useful patterns.

    Returns the combined pattern as a string, e.g. '(pat1|pat2|...)'.
    """
    parts = []
    with open('regex.txt') as f:
        for raw in f:
            line = raw.strip()
            if not line.startswith('#') and len(line) > 3:
                parts.append(line)
    # Joining here replaces the original "append '|' then strip '|)'" hack,
    # which would also have mangled any pattern containing the substring '|)'.
    return '(' + '|'.join(parts) + ')'
# Get the shannon entropy of a string
def entropy(string):
    """Return the Shannon entropy of *string*, in bits per symbol.

    Raises ZeroDivisionError on an empty string (as the original did).
    """
    # Single pass to count symbol frequencies; the original called
    # string.count() once per distinct character, which is O(n * k).
    freq = {}
    for c in string:
        freq[c] = freq.get(c, 0) + 1
    length = float(len(string))
    return -sum((n / length) * math.log(n / length, 2) for n in freq.values())
def generateFuzzyHash(filename):
    """Print one 'filename,pattern' line per non-whitelisted prolog pattern.

    Patterns are printed in sorted order so the output is deterministic...
    a very poor man's fuzzy hash.
    """
    # Print out those that aren't in the whitelist
    byte_patterns = getBytePatterns(filename)
    # sorted() replaces the original's manual append-then-sort loop;
    # print(...) is a parenthesized single-argument print, identical output
    # on Python 2 and valid on Python 3.
    for pattern in sorted(byte_patterns):
        print(filename + ',' + pattern)
def generateYara(filename, singleFile, tight=True, max_lines=3000, min_patterns=0):
    """Print a yara rule built from the prolog byte patterns of *filename*.

    singleFile   -- kept for interface compatibility; unused by the current
                    implementation
    tight        -- True: require percent_tight_match of the strings to
                    match; False: emit a loose hunting rule ('any of them')
    max_lines    -- cap on the number of $a_N string lines emitted
    min_patterns -- emit nothing unless more than this many patterns found
    """
    global seen_patterns
    global percent_tight_match

    # Print out those that aren't in the whitelist
    byte_patterns = getBytePatterns(filename)

    # Don't print the same rule twice when walking a folder
    key = str(byte_patterns)
    if key in seen_patterns:
        return
    seen_patterns[key] = 1

    # If we have no, or only one, pattern it probably won't be a tight
    # enough signature
    if len(byte_patterns) <= min_patterns:
        return

    # The original printed 'tight_' for both modes, mislabelling hunting
    # rules; hunting rules now get a 'hunt_' prefix.
    prefix = 'tight_' if tight else 'hunt_'
    patterns = list(byte_patterns)[:max_lines - 1]

    print('rule ' + prefix + filename.replace('/', '_').replace('.', '') + ' {')
    print(' strings:')
    # Number from $a_1 (the original incremented its counter before use,
    # so labels started at $a_2 and the counter over-counted by one).
    for index, pattern in enumerate(patterns, 1):
        print('   $a_' + str(index) + ' = { ' + pattern + ' }')
    print(' condition:')
    if tight:
        # Threshold computed from the real string count; the original's
        # off-by-one counter inflated it by one string.
        tight_decimal = int(round(len(patterns) * percent_tight_match))
        print('   ' + str(tight_decimal) + ' of them')
    else:
        print('   any of them')
    print('}')
    print('\r\n\r\n')
def fuzzyHash(filename, tight=True):
    """Fuzzy-hash *filename*, or every file inside it if it is a folder.

    *tight* is accepted for interface symmetry with yara() but is not used.
    """
    if os.path.isdir(filename):
        for entry in os.listdir(filename):
            generateFuzzyHash('./' + filename + '/' + entry)
    elif os.path.isfile(filename):
        generateFuzzyHash(filename)
def yara(filename, tight=True):
    """Emit yara rules for *filename*, or for every file inside a folder.

    *tight* is forwarded to generateYara() (False produces hunting rules).
    """
    if os.path.isdir(filename):
        for entry in os.listdir(filename):
            generateYara('./' + filename + '/' + entry, False, tight)
    elif os.path.isfile(filename):
        generateYara(filename, True, tight)
# Returns true if a pattern is whitelisted
def whitelisted(pattern):
    """Return True if *pattern* is present in the whitelist table."""
    # Parameterized query; the original concatenated the pattern straight
    # into the SQL string (injection-prone, and broken by embedded quotes).
    db.execute('SELECT * FROM whitelist WHERE pattern = ?', (pattern,))
    return db.fetchone() is not None
def addToWhitelist(folder):
    """Scan every file in *folder* and store its prolog patterns in the whitelist.

    With min_seen == 0 every pattern is inserted immediately; otherwise
    patterns are counted across all files first and only those seen more
    than min_seen times are inserted.  All inserts use parameterized SQL
    (the original built the statements by string concatenation).
    """
    # Minimum number of samples a pattern must be in
    min_seen = 1
    count = 0
    # If we dont care how often it's been seen, just insert it
    if min_seen == 0:
        for f in os.listdir(folder):
            count = count + 1
            print('Processed ' + str(count) + ' file(s)')
            print('Processing ' + f)
            new_seen = getBytePatterns('./' + folder + '/' + f, True)
            for pattern in new_seen:
                db.execute(
                    'insert or ignore into whitelist (pattern) values (?)',
                    (pattern,))
        conn.commit()
    # Otherwise actually keep track of how many samples a pattern has been in
    else:
        seen = {}
        # Build a count of how often every pattern was seen
        for f in os.listdir(folder):
            count = count + 1
            print('Processed ' + str(count) + ' file(s)')
            new_seen = getBytePatterns('./' + folder + '/' + f, True)
            for pattern in new_seen:
                seen[pattern] = seen.get(pattern, 0) + 1
        total = 0
        # Insert every pattern seen > min_seen times into the whitelist.
        # NOTE(review): the strict '>' with min_seen = 1 means a pattern
        # must appear in at least two samples - confirm that is intended.
        # (.items() replaces Python-2-only .iteritems(); 'occurrences'
        # replaces the inner loop variable that shadowed the file counter.)
        for pattern, occurrences in seen.items():
            if occurrences > min_seen:
                total = total + 1
                db.execute(
                    'insert or ignore into whitelist (pattern) values (?)',
                    (pattern,))
        conn.commit()
def generateSample(filename):
    """Record every non-whitelisted prolog pattern of *filename* in the
    malware table, keyed by the file's MD5.

    Caller is responsible for conn.commit() (see addMalware()).
    """
    # 'with' closes the handle; the original leaked it via open(...).read()
    with open(filename, 'rb') as f:
        md5 = hashlib.md5(f.read()).hexdigest()
    # Print out those that aren't in the whitelist
    byte_patterns = getBytePatterns(filename)
    for pattern in byte_patterns:
        # Parameterized to avoid SQL injection via crafted pattern bytes
        db.execute('insert or ignore into malware (pattern, md5) values (?, ?)',
                   (pattern, md5))
def deleteDatabase():
    """Drop and recreate the whitelist and malware tables (empties both)."""
    statements = (
        'DROP TABLE IF EXISTS whitelist',
        'DROP TABLE IF EXISTS malware',
        'CREATE TABLE whitelist (pattern text)',
        'CREATE UNIQUE INDEX whitelist_index on whitelist (pattern)',
        'CREATE TABLE malware (pattern text, md5 text)',
        'CREATE UNIQUE INDEX malware_index on malware (pattern, md5)',
    )
    for statement in statements:
        db.execute(statement)
# Add a file or folder to malware db
def addMalware(filename):
    """Add *filename* (or every file inside it, if a folder) to the malware DB.

    Commits once after all samples are inserted.
    """
    # Parenthesized single-argument print: identical output on Python 2,
    # valid syntax on Python 3 (the original used Python-2-only prints).
    print('Adding samples to malware database')
    if os.path.isdir(filename):
        for f in os.listdir(filename):
            generateSample('./' + filename + '/' + f)
    elif os.path.isfile(filename):
        generateSample(filename)
    conn.commit()
    print('Added samples')
# For every pattern in file, find related
def malwareSearch(filename):
    """Print every known malware sample sharing a prolog pattern with *filename*.

    The file's own MD5 is excluded from the results; each related sample is
    reported once, with the first pattern that linked it.
    """
    # 'with' closes the handle; the original leaked it via open(...).read()
    with open(filename, 'rb') as f:
        md5 = hashlib.md5(f.read()).hexdigest()
    pattern_lookups = {}
    found_samples = set()
    # Print out those that aren't in the whitelist
    byte_patterns = getBytePatterns(filename)
    for pattern in byte_patterns:
        for sample in findRelated(pattern):
            if sample not in found_samples and sample != md5:
                found_samples.add(sample)
                pattern_lookups[sample] = pattern
    if found_samples:
        print('Found related samples:')
        for sample in found_samples:
            print(sample + ' matched via ' + pattern_lookups[sample])
    else:
        print('No related samples found')
def findRelated(pattern):
    """Return the MD5s of all malware samples containing *pattern*."""
    # Parameterized query; the original concatenated the pattern straight
    # into the SQL string (injection-prone, and broken by embedded quotes).
    db.execute('SELECT md5 FROM malware WHERE pattern = ?', (pattern,))
    return [row[0] for row in db.fetchall()]
# This regex decides what patterns we will extract
prolog_regex = loadProlog()

# Only run the CLI when executed as a script; the original called
# parseArguments() unconditionally, which also ran it on import.
if __name__ == '__main__':
    parseArguments()